Skip to content

run-cluster

run-cluster #30

Workflow file for this run

name: run-cluster
on:
workflow_dispatch:
inputs:
daft_version:
type: string
description: The version of daft to install
required: false
python_version:
type: string
description: The version of python to use
required: false
default: "3.9"
command:
type: string
description: The command to run on the cluster
required: true
working_dir:
type: string
description: The working directory to submit to the cluster
required: false
default: .github/working-dir
jobs:
run-command:
runs-on: [self-hosted, linux, x64, ci-dev]
# timeout-minutes: 15 # Remove for ssh debugging
permissions:
id-token: write
contents: read
steps:
- name: Checkout repo
uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-west-2
role-session-name: run-command-workflow
- name: Install uv, rust, python
uses: ./.github/actions/install
with:
python_version: ${{ inputs.python_version }}
- name: Setup uv environment
run: |
uv v
source .venv/bin/activate
uv pip install ray[default] boto3
- name: Dynamically update ray config file
run: |
id="ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}"
sed -i "s|{{RAY_CLUSTER_NAME}}|$id|g" .github/assets/benchmarking_ray_config.yaml
sed -i 's|{{PYTHON_VERSION}}|${{ inputs.python_version }}|g' .github/assets/benchmarking_ray_config.yaml
if [[ '${{ inputs.daft_version }}' ]]; then
sed -i 's|{{DAFT_VERSION}}|==${{ inputs.daft_version }}|g' .github/assets/benchmarking_ray_config.yaml
else
sed -i 's|{{DAFT_VERSION}}||g' .github/assets/benchmarking_ray_config.yaml
fi
- name: Download private ssh key
run: |
KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
echo "$KEY" >> ~/.ssh/ci-github-actions-ray-cluster-key.pem
chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem
- name: Spin up ray cluster
run: |
source .venv/bin/activate
ray up .github/assets/benchmarking_ray_config.yaml -y
- name: Setup connection to ray cluster
run: |
source .venv/bin/activate
ray dashboard .github/assets/benchmarking_ray_config.yaml &
- name: Setup upterm session
uses: lhotari/action-upterm@v1
- name: Submit job to ray cluster
run: |
source .venv/bin/activate
if [[ -z '${{ inputs.command }}' ]]; then
echo 'Invalid command submitted; command cannot be empty'
exit 1
fi
ray job submit \
--working-dir ${{ inputs.working_dir }} \
--address http://localhost:8265 \
--runtime-env-json '{"env_vars": {"DAFT_ENABLE_RAY_TRACING": "1"}}' \
-- ${{ inputs.command }}
- name: Download log files from ray cluster
run: |
source .venv/bin/activate
ray rsync-down .github/assets/benchmarking_ray_config.yaml /tmp/ray/session_*/logs ray-daft-logs
- name: Kill connection to ray cluster
run: |
PID=$(lsof -t -i:8265)
if [[ -n "$PID" ]]; then
echo "Process $PID is listening on port 8265; killing it..."
kill -9 "$PID"
if [[ $? -eq 0 ]]; then
echo "Process $PID killed successfully"
else
echo "Failed to kill process $PID"
fi
fi
- name: Spin down ray cluster
if: always()
run: |
source .venv/bin/activate
ray down .github/assets/benchmarking_ray_config.yaml -y
- name: Upload log files
uses: actions/upload-artifact@v4
with:
name: ray-daft-logs
path: ray-daft-logs