From 0343b128b86251b8a786d21cbac4c83724b41f5a Mon Sep 17 00:00:00 2001
From: Raunak Bhagat
Date: Mon, 18 Nov 2024 11:35:31 -0800
Subject: [PATCH] Add workflows to build a commit and run TPC-H benchmarks -
 slightly different from the existing one

---
 .github/actions/install/action.yaml          |  29 ++++
 .github/assets/ray.yaml                      |  65 +++++++++
 .github/scripts/csv_to_md.py                 |  29 ++++
 .github/workflows/build-commit-run-tpch.yaml |  41 ++++++
 .github/workflows/build-commit.yaml          |  66 +++++++++
 .github/workflows/run-tpch.yaml              | 146 +++++++++++++++++++
 benchmarking/tpch/__main__.py                |  15 +-
 7 files changed, 388 insertions(+), 3 deletions(-)
 create mode 100644 .github/actions/install/action.yaml
 create mode 100644 .github/assets/ray.yaml
 create mode 100644 .github/scripts/csv_to_md.py
 create mode 100644 .github/workflows/build-commit-run-tpch.yaml
 create mode 100644 .github/workflows/build-commit.yaml
 create mode 100644 .github/workflows/run-tpch.yaml

diff --git a/.github/actions/install/action.yaml b/.github/actions/install/action.yaml
new file mode 100644
index 0000000000..543d9a3e86
--- /dev/null
+++ b/.github/actions/install/action.yaml
@@ -0,0 +1,29 @@
+name: Install uv, rust, and python
+description: Install uv, rust, and python
+inputs:
+  python_version:
+    description: The version of python to install
+    required: false
+    default: '3.9'
+runs:
+  using: composite
+  steps:
+  - shell: bash
+    run: |
+      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+      CARGO_BIN="$HOME/.cargo/bin"
+      echo 'export PATH="$CARGO_BIN:$PATH"' >> $HOME/.bashrc
+      echo "$CARGO_BIN" >> $GITHUB_PATH
+  - shell: bash
+    run: |
+      curl -LsSf https://astral.sh/uv/install.sh | sh
+      UV_BIN="$HOME/.local/bin"
+      echo 'export PATH="$UV_BIN:$PATH"' >> $HOME/.bashrc
+      echo "$UV_BIN" >> $GITHUB_PATH
+  - shell: bash
+    run: |
+      source $HOME/.bashrc
+  - shell: bash
+    run: |
+      uv python install ${{ inputs.python_version }}
+      uv python pin ${{ inputs.python_version }}
diff --git a/.github/assets/ray.yaml b/.github/assets/ray.yaml
new file mode 100644
index 0000000000..486b72bad4
--- /dev/null
+++ b/.github/assets/ray.yaml
@@ -0,0 +1,65 @@
+cluster_name: performance-comparisons
+
+provider:
+  type: aws
+  region: us-west-2
+  cache_stopped_nodes: true
+  security_group:
+    GroupName: ray-autoscaler-c1
+
+auth:
+  ssh_user: ubuntu
+  ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem
+
+max_workers: 2
+available_node_types:
+  ray.head.default:
+    resources: {"CPU": 0}
+    node_config:
+      KeyName: ci-github-actions-ray-cluster-key
+      InstanceType: i3.2xlarge
+      ImageId: ami-04dd23e62ed049936
+      IamInstanceProfile:
+        Name: ray-autoscaler-v1
+
+  ray.worker.default:
+    min_workers: 2
+    max_workers: 2
+    resources: {}
+    node_config:
+      KeyName: ci-github-actions-ray-cluster-key
+      InstanceType: i3.2xlarge
+      ImageId: ami-04dd23e62ed049936
+      IamInstanceProfile:
+        Name: ray-autoscaler-v1
+
+setup_commands:
+# Mount drive
+- |
+  findmnt /tmp 1> /dev/null
+  code=$?
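+  # `findmnt` exits non-zero when /tmp is not already a mountpoint; only in
+  # that case do we format the instance's NVMe drive and mount it at /tmp.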
+
+  if [ $code -ne 0 ]; then
+    sudo mkfs.ext4 /dev/nvme0n1
+    sudo mount -t ext4 /dev/nvme0n1 /tmp
+    sudo chmod 777 /tmp
+  fi
+# Install dependencies
+- sudo snap install aws-cli --classic
+- curl -LsSf https://astral.sh/uv/install.sh | sh
+- echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
+- source ~/.bashrc
+# The GitHub Actions workflow replaces all parameters between `<<...>>` with
+# the actual values, as determined at runtime of the workflow.
+- uv python install <<PYTHON_VERSION>>
+- uv python pin <<PYTHON_VERSION>>
+- uv venv
+- echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc
+- source .venv/bin/activate
+- uv pip install pip ray[default] py-spy
+- uv pip install https://github-actions-artifacts-bucket.s3.us-west-2.amazonaws.com/builds/<<COMMIT_HASH>>/<<WHEEL_NAME>>
+# Download benchmarking fixtures
+- |
+  aws s3 sync \
+    s3://eventual-dev-benchmarking-fixtures/uncompressed/tpch-dbgen/<<SCALE_FACTOR>>/<<PARTITION_SIZE>>/parquet/ \
+    /tmp/data/<<SCALE_FACTOR>>/<<PARTITION_SIZE>>/parquet/ \
+    --quiet
diff --git a/.github/scripts/csv_to_md.py b/.github/scripts/csv_to_md.py
new file mode 100644
index 0000000000..b9a74f4f40
--- /dev/null
+++ b/.github/scripts/csv_to_md.py
@@ -0,0 +1,29 @@
+"""Convert a CSV file into a markdown table.
+
+Usage: python csv_to_md.py <input.csv> <output.md>
+"""
+
+import csv
+import sys
+from pathlib import Path
+
+input_path = Path(sys.argv[1])
+assert input_path.exists()
+
+output_path = Path(sys.argv[2])
+assert not output_path.exists()
+
+
+def make_md_row(row: list[str]) -> str:
+    return f'|{"|".join(row)}|\n'
+
+
+with open(input_path) as csv_file, open(output_path, "w") as md_file:
+    csv_reader = csv.reader(csv_file)
+    header = next(csv_reader)
+
+    md_file.write(make_md_row(header))
+
+    # Markdown tables require a separator row between the header and the body
+    md_file.write(make_md_row(["-"] * len(header)))
+
+    for row in csv_reader:
+        md_file.write(make_md_row(row))
diff --git a/.github/workflows/build-commit-run-tpch.yaml b/.github/workflows/build-commit-run-tpch.yaml
new file mode 100644
index 0000000000..52102c4196
--- /dev/null
+++ b/.github/workflows/build-commit-run-tpch.yaml
@@ -0,0 +1,41 @@
+name: Build commit and run TPC-H benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      wheel:
+        description: The wheel artifact to use
+        required: false
+        default: getdaft-0.3.0.dev0-cp38-abi3-manylinux_2_31_x86_64.whl
+      skip_questions:
+        description: The TPC-H questions to skip
+        required: false
+        default: ""
+      scale_factor:
+        description: Which scale factor to use
+        required: false
+        default: 2
+      partition_size:
+        description: The number of partitions to use
+        required: false
+        default: 2
+      python_version:
+        description: The version of python to use
+        required: false
+        default: '3.9'
+
+jobs:
+  build:
+    uses: ./.github/workflows/build-commit.yaml
+    secrets:
+      ACTIONS_AWS_ROLE_ARN: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
+
+  run:
+    needs: build
+    uses: ./.github/workflows/run-tpch.yaml
+    with:
+      wheel: ${{ needs.build.outputs.wheel }}
+      skip_questions: ${{ inputs.skip_questions }}
+      scale_factor: ${{ inputs.scale_factor }}
+      partition_size: ${{ inputs.partition_size }}
+      python_version: ${{ inputs.python_version }}
diff --git a/.github/workflows/build-commit.yaml b/.github/workflows/build-commit.yaml
new file mode 100644
index 0000000000..89186738d1
--- /dev/null
+++ b/.github/workflows/build-commit.yaml
@@ -0,0 +1,66 @@
+name: Build a Daft commit and store the resulting wheel in AWS S3
+
+on:
+  workflow_dispatch:
+  workflow_call:
+    secrets:
+      ACTIONS_AWS_ROLE_ARN:
+        description: The ARN of the AWS role to assume
+        required: true
+    outputs:
+      wheel:
+        description: The wheel file that was built
+        value: ${{ jobs.build-commit.outputs.wheel }}
+
+jobs:
+  build-commit:
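+    # ~/target is restored from the buildjet cache below; when a previously
+    # built wheel is already present there, the build step is skipped and
+    # that wheel is uploaded instead.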
+    runs-on: buildjet-8vcpu-ubuntu-2004
+    timeout-minutes: 15  # Remove for ssh debugging
+    permissions:
+      id-token: write
+      contents: read
+    outputs:
+      wheel: ${{ steps.build_and_upload.outputs.wheel }}
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 1
+    - uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-west-2
+        role-session-name: build-commit-workflow
+        role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
+    - uses: ./.github/actions/install
+    - uses: buildjet/cache@v4
+      with:
+        path: ~/target
+        key: ${{ runner.os }}-cargo-deps-${{ hashFiles('**/Cargo.lock') }}
+        restore-keys: ${{ runner.os }}-cargo-deps-
+    - id: build_and_upload
+      run: |
+        if ! ls ~/target/wheels/*.whl 1> /dev/null 2>&1; then
+          # Build wheel
+          export CARGO_TARGET_DIR=~/target
+          uv venv
+          source .venv/bin/activate
+          uv pip install pip maturin
+          maturin build --release
+        fi
+
+        count=$(ls ~/target/wheels/*.whl 2> /dev/null | wc -l)
+        if [ "$count" -gt 1 ]; then
+          echo "Found more than 1 wheel"
+          exit 1
+        elif [ "$count" -eq 0 ]; then
+          echo "Found no wheel files"
+          exit 1
+        fi
+
+        # Upload wheel
+        for file in ~/target/wheels/*.whl; do
+          aws s3 cp "$file" s3://github-actions-artifacts-bucket/builds/${{ github.sha }}/ --acl public-read --no-progress
+          file_basename=$(basename "$file")
+          echo "wheel=$file_basename" >> $GITHUB_OUTPUT
+          echo "Output wheel has been built and stored in S3 at the following location:" >> $GITHUB_STEP_SUMMARY
+          echo "https://us-west-2.console.aws.amazon.com/s3/buckets/github-actions-artifacts-bucket?prefix=builds/${{ github.sha }}/" >> $GITHUB_STEP_SUMMARY
+        done
diff --git a/.github/workflows/run-tpch.yaml b/.github/workflows/run-tpch.yaml
new file mode 100644
index 0000000000..73ee503cb9
--- /dev/null
+++ b/.github/workflows/run-tpch.yaml
@@ -0,0 +1,146 @@
+name: Run TPC-H benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      wheel:
+        description: The wheel artifact to use
+        required: false
+        default: getdaft-0.3.0.dev0-cp38-abi3-manylinux_2_31_x86_64.whl
+      skip_questions:
+        description: The TPC-H questions to skip
+        required: false
+        default: ""
+      scale_factor:
+        description: Which scale factor to use
+        required: false
+        default: 2
+      partition_size:
+        description: The number of partitions to use
+        required: false
+        default: 2
+      python_version:
+        description: The version of python to use
+        required: false
+        default: '3.9'
+  workflow_call:
+    inputs:
+      wheel:
+        type: string
+        description: The wheel artifact to use
+        required: false
+        default: getdaft-0.3.0.dev0-cp38-abi3-manylinux_2_31_x86_64.whl
+      skip_questions:
+        type: string
+        description: The TPC-H questions to skip
+        required: false
+        default: ""
+      scale_factor:
+        type: string
+        description: Which scale factor to use
+        required: false
+        default: 2
+      partition_size:
+        type: string
+        description: The number of partitions to use
+        required: false
+        default: 2
+      python_version:
+        type: string
+        description: The version of python to use
+        required: false
+        default: '3.9'
+
+jobs:
+  run-tpch:
+    runs-on: [self-hosted, linux, x64, ci-dev]
+    timeout-minutes: 15  # Remove for ssh debugging
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 1
+    - uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-west-2
+        role-session-name: run-tpch-workflow
+    - uses: ./.github/actions/install
+    - run: |
+        scale_factor_str="${{ inputs.scale_factor }}_0"
+
+        # Dynamically update ray config file
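+        # The sed invocations below substitute the `<<...>>` placeholders in
+        # .github/assets/ray.yaml (placeholder names mirror the values they
+        # receive):
+        #   <<COMMIT_HASH>>    <- github.sha
+        #   <<WHEEL_NAME>>     <- inputs.wheel
+        #   <<SCALE_FACTOR>>   <- inputs.scale_factor, suffixed with "_0"
+        #   <<PARTITION_SIZE>> <- inputs.partition_size
+        #   <<PYTHON_VERSION>> <- inputs.python_version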
+        sed -i 's|<<COMMIT_HASH>>|${{ github.sha }}|g' .github/assets/ray.yaml
+        sed -i 's|<<WHEEL_NAME>>|${{ inputs.wheel }}|g' .github/assets/ray.yaml
+        sed -i "s|<<SCALE_FACTOR>>|$scale_factor_str|g" .github/assets/ray.yaml
+        sed -i 's|<<PARTITION_SIZE>>|${{ inputs.partition_size }}|g' .github/assets/ray.yaml
+        sed -i 's|<<PYTHON_VERSION>>|${{ inputs.python_version }}|g' .github/assets/ray.yaml
+
+        # Download private ssh key
+        KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
+        echo "$KEY" > ~/.ssh/ci-github-actions-ray-cluster-key.pem
+        chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem
+
+        # Install dependencies
+        uv venv
+        source .venv/bin/activate
+        # Remove the local daft source tree so the wheel installed below is
+        # the one that gets imported
+        rm -rf daft
+        uv pip install ray[default] boto3 https://github-actions-artifacts-bucket.s3.us-west-2.amazonaws.com/builds/${{ github.sha }}/${{ inputs.wheel }}
+
+        # Boot up ray cluster and submit tpch benchmarking job
+        ray up .github/assets/ray.yaml -y
+        HEAD_NODE_IP=$(ray get-head-ip .github/assets/ray.yaml | tail -n 1)
+        # Forward the Ray dashboard (port 8265) from the head node
+        ssh -o StrictHostKeyChecking=no -fN -L 8265:localhost:8265 -i ~/.ssh/ci-github-actions-ray-cluster-key.pem ubuntu@$HEAD_NODE_IP
+        export DAFT_ENABLE_RAY_TRACING=1
+        export DAFT_RUNNER=ray
+        if [[ -n "${{ inputs.skip_questions }}" ]]; then
+          python -m benchmarking.tpch \
+            --scale_factor ${{ inputs.scale_factor }} \
+            --num_parts ${{ inputs.partition_size }} \
+            --parquet_file_cache /tmp/data \
+            --output_csv output.csv \
+            --ray_job_dashboard_url http://localhost:8265 \
+            --skip_warmup \
+            --no_pymodules \
+            --skip_questions="${{ inputs.skip_questions }}"
+        else
+          python -m benchmarking.tpch \
+            --scale_factor ${{ inputs.scale_factor }} \
+            --num_parts ${{ inputs.partition_size }} \
+            --parquet_file_cache /tmp/data \
+            --output_csv output.csv \
+            --ray_job_dashboard_url http://localhost:8265 \
+            --skip_warmup \
+            --no_pymodules
+        fi
+
+        # Download all logs
+        #
+        # We also need to rename any file whose name contains ':' to use '_'
+        # instead; GHA `actions/upload-artifact@v4` does *not* allow colons!
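+        # (For example, a hypothetical log file named `raylet.out:2024-11-18`
+        # would be renamed to `raylet.out_2024-11-18`.)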
+        ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs/daft ray-daft-logs
+        find ray-daft-logs -depth -name '*:*' -exec bash -c '
+          for filepath; do
+            dir=$(dirname "$filepath")
+            base=$(basename "$filepath")
+            new_base=${base//:/_}
+            mv "$filepath" "$dir/$new_base"
+          done
+        ' _ {} +
+
+        # Tear down ray cluster
+        ray down .github/assets/ray.yaml -y
+
+        # Convert csv to markdown and print to GHA Summary Page
+        python .github/scripts/csv_to_md.py output.csv output.md
+        echo "# Results" >> $GITHUB_STEP_SUMMARY
+        cat output.md >> $GITHUB_STEP_SUMMARY
+    - uses: actions/upload-artifact@v4
+      with:
+        name: output.csv
+        path: output.csv
+    - uses: actions/upload-artifact@v4
+      with:
+        name: ray-daft-logs
+        path: ray-daft-logs
diff --git a/benchmarking/tpch/__main__.py b/benchmarking/tpch/__main__.py
index 8ad131e08f..125712ef92 100644
--- a/benchmarking/tpch/__main__.py
+++ b/benchmarking/tpch/__main__.py
@@ -126,6 +126,7 @@ def run_all_benchmarks(
     csv_output_location: str | None,
     ray_job_dashboard_url: str | None = None,
     requirements: str | None = None,
+    no_pymodules: bool = False,
 ):
     get_df = get_df_with_parquet_folder(parquet_folder)
 
@@ -143,7 +144,7 @@ def run_all_benchmarks(
             tpch_qnum=i,
             working_dir=working_dir,
             entrypoint=entrypoint,
-            runtime_env=get_ray_runtime_env(requirements),
+            runtime_env=get_ray_runtime_env(requirements, no_pymodules=no_pymodules),
         )
 
         # Run once as a warmup step
@@ -202,11 +203,13 @@ def get_daft_benchmark_runner_name() -> Literal["ray"] | Literal["py"] | Literal
     return name
 
 
-def get_ray_runtime_env(requirements: str | None) -> dict:
+def get_ray_runtime_env(requirements: str | None, no_pymodules: bool = False) -> dict:
+    daft_env_variables = {key: value for key, value in os.environ.items() if key.startswith("DAFT")}  # forward DAFT_* env vars to the Ray workers
     runtime_env = {
-        "py_modules": [daft],
+        "py_modules": None if no_pymodules else [daft],
         "eager_install": True,
         "env_vars": {
+            **daft_env_variables,
             "DAFT_PROGRESS_BAR": "0",
             "DAFT_RUNNER": "ray",
         },
@@ -293,6 +296,11 @@ def warm_up_function():
         default=None,
         help="Ray Dashboard URL to submit jobs instead of using Ray client, most useful when running on a remote cluster",
     )
+    parser.add_argument(
+        "--no_pymodules",
+        action="store_true",
+        help="Avoid shipping the local daft package to the cluster via Ray's py_modules; useful in CI, where the wheel is already installed on the cluster",
+    )
 
     args = parser.parse_args()
     if args.output_csv_headers:
@@ -331,4 +339,5 @@
         csv_output_location=args.output_csv,
         ray_job_dashboard_url=args.ray_job_dashboard_url,
         requirements=args.requirements,
+        no_pymodules=args.no_pymodules,
     )