# Workflow file for the "run-cluster" workflow, as shown for run-cluster run #190.

# Manually dispatched workflow. The caller supplies a command that prints a
# ray cluster config and a command to run once that cluster is up.
name: run-cluster

on:
  workflow_dispatch:
    inputs:
      # Forwarded to the local install action as its `python_version` input.
      python_version:
        description: Python version
        required: false
        # Quoted so the value stays the string "3.9" rather than a float.
        default: '3.9'
        type: string
      # NOTE(review): among the steps visible in this file, only a
      # commented-out templating step reads this input - confirm it is still
      # needed.
      cluster_profile:
        description: Cluster profile
        required: false
        default: medium-x86
        type: choice
        options:
          - medium-x86
          - debug_xs-x86
      # Shell command whose stdout becomes /tmp/ray.yaml.
      config_command:
        description: Command to produce ray config file
        required: true
        type: string
      # Shell command executed after the cluster connection is set up.
      entrypoint:
        description: Command
        required: true
        type: string
jobs:
  # Single job: every step below runs in order on one self-hosted runner.
  run-command:
    # All four labels must match for a runner to pick the job up.
    runs-on:
      - self-hosted
      - linux
      - x64
      - ci-dev
    timeout-minutes: 15  # Remove for ssh debugging
    permissions:
      contents: read
      # Allows the job to request an OIDC token.
      id-token: write
    env:
      # Exported to every step of the job.
      RUN_MODE: ci
    steps:
# Record every workflow_dispatch input in the run log as JSON.
- name: Log workflow inputs
  env:
    # Handed over through the environment instead of being spliced into the
    # script text: the JSON is full of double quotes, which would otherwise
    # terminate the shell string (mangling the output and letting input text
    # be interpreted as shell code).
    WORKFLOW_INPUTS: ${{ toJson(github.event.inputs) }}
  run: echo "$WORKFLOW_INPUTS"
# Fetch the repository; later steps use files from it, such as the local
# install action under .github/actions/install.
- name: Checkout repo
  uses: actions/checkout@v4
  with:
    # Only the most recent commit is fetched; no history.
    fetch-depth: 1
# Make AWS credentials available to the steps that call the AWS CLI.
# NOTE(review): no `role-to-assume` is passed even though the job grants
# `id-token: write`; presumably the self-hosted runner already carries
# credentials - confirm.
- name: Configure AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    role-session-name: run-cluster-workflow
    aws-region: us-west-2
# Run the repository-local composite action, passing it the Python version
# chosen when the workflow was dispatched.
- name: Install uv, rust, python
  uses: ./.github/actions/install
  with:
    python_version: ${{ inputs.python_version }}
# Create the virtual environment at .venv (the path every later step
# activates) and install the ray CLI and boto3 into it.
- name: Setup uv environment
  run: |
    # `uv venv` spelled out in full instead of the terse `uv v` alias.
    uv venv
    source .venv/bin/activate
    # Quoted: unquoted, `ray[default]` is a glob pattern that the shell
    # rewrites if a matching file name exists in the working directory.
    uv pip install "ray[default]" boto3
# Run the caller-supplied command and capture its stdout as /tmp/ray.yaml,
# the cluster config consumed by the ray steps that follow.
- name: Produce ray configuration file
  env:
    # Passed through the environment so that quotes or other shell
    # metacharacters in the input cannot break the surrounding script text.
    CONFIG_COMMAND: ${{ inputs.config_command }}
  run: |
    source .venv/bin/activate
    # Drop any config left behind by an earlier run on this runner.
    rm -f /tmp/ray.yaml
    # Redirecting directly (instead of `echo "$(...)"`) keeps the command's
    # exit status, so a failing command fails this step rather than writing
    # an empty config.
    eval "$CONFIG_COMMAND" > /tmp/ray.yaml
    if [[ ! -s /tmp/ray.yaml ]]; then
      echo 'Config command produced no output'
      exit 1
    fi
    cat /tmp/ray.yaml
# - name: Dynamically update ray config file
# run: |
# source .venv/bin/activate
# (cat .github/assets/template.yaml | \
# uv run \
# --python 3.12 \
# .github/ci-scripts/templatize_ray_config.py \
# --cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \
# --daft-wheel-url '${{ inputs.daft_wheel_url }}' \
# --daft-version '${{ inputs.daft_version }}' \
# --python-version '${{ inputs.python_version }}' \
# --cluster-profile '${{ inputs.cluster_profile }}' \
# --working-dir '${{ inputs.working_dir }}' \
# --entrypoint-script '${{ inputs.entrypoint_script }}'
# ) >> .github/assets/ray.yaml
# cat .github/assets/ray.yaml
# - name: Setup ray env vars
# run: |
# source .venv/bin/activate
# ray_env_var=$(python .github/ci-scripts/format_env_vars.py \
# --env-vars '${{ inputs.env_vars }}' \
# --enable-ray-tracing \
# )
# echo $ray_env_var
# echo "ray_env_var=$ray_env_var" >> $GITHUB_ENV
# Fetch the cluster's private ssh key from AWS Secrets Manager and store it
# at ~/.ssh/ci-github-actions-ray-cluster-key.pem with owner-only access.
- name: Download private ssh key
  run: |
    KEY_FILE=~/.ssh/ci-github-actions-ray-cluster-key.pem
    # The directory is not guaranteed to exist on the runner.
    mkdir -p ~/.ssh
    # Files created from here on are owner-only, so the key is never
    # readable by others, not even before the chmod below.
    umask 077
    # Fetch first, write second: a failed fetch aborts the step without
    # touching whatever key file is already on disk.
    KEY=$(aws secretsmanager get-secret-value \
      --secret-id ci-github-actions-ray-cluster-key-3 \
      --query SecretString \
      --output text)
    # Overwrite rather than append: the runner is self-hosted, so a file
    # from a previous run may still be there and appending would stack
    # several copies of the key in one file.
    printf '%s\n' "$KEY" > "$KEY_FILE"
    # Also tightens the mode of a pre-existing file, which umask does not.
    chmod 600 "$KEY_FILE"
# Bring up the cluster described by the generated config file.
- name: Spin up ray cluster
  run: |
    # The ray CLI lives in the virtual environment.
    source .venv/bin/activate
    # -y: do not wait for an interactive confirmation.
    ray up /tmp/ray.yaml -y
# Start `ray dashboard` in the background so it keeps running while the
# following steps execute; a later step kills whatever holds port 8265.
- name: Setup connection to ray cluster
  run: |
    source .venv/bin/activate
    ray dashboard /tmp/ray.yaml &
    # The command above returns immediately, so without a wait the next
    # step can start before the port is open. Poll for a bounded time.
    for _ in $(seq 1 30); do
      if lsof -t -i:8265 > /dev/null 2>&1; then
        echo 'Port 8265 is open'
        exit 0
      fi
      sleep 1
    done
    # Deliberately non-fatal: the entrypoint may not need the connection.
    echo '::warning::Port 8265 is not open after 30s; continuing anyway'
# Execute the caller-supplied entrypoint command inside the virtual
# environment; the step fails if the command fails.
- name: Run the given entrypoint
  env:
    # Passed through the environment so that a quote character in the input
    # cannot terminate the shell strings below.
    ENTRYPOINT: ${{ inputs.entrypoint }}
  run: |
    source .venv/bin/activate
    # Reject an empty or whitespace-only entrypoint.
    if [[ -z "${ENTRYPOINT//[[:space:]]/}" ]]; then
      echo 'Invalid entrypoint submitted'
      exit 1
    fi
    # Only the first word can be a script that needs the executable bit;
    # running chmod on the arguments of a command would abort the step.
    read -r SCRIPT _ <<< "$ENTRYPOINT"
    if [[ -f "$SCRIPT" ]]; then
      chmod +x "$SCRIPT"
    fi
    # eval keeps the input being parsed as shell code, as it was when it
    # was spliced directly into the script.
    eval "$ENTRYPOINT"
# - name: Download log files from ray cluster
# run: |
# source .venv/bin/activate
# ray rsync-down /tmp/ray.yaml /tmp/ray/session_*/logs ray-daft-logs
# find ray-daft-logs -depth -name '*:*' -exec bash -c '
# for filepath; do
# dir=$(dirname "$filepath")
# base=$(basename "$filepath")
# new_base=${base//:/_}
# mv "$filepath" "$dir/$new_base"
# done
# ' _ {} +
# Kill every process that holds a socket on port 8265, i.e. the background
# `ray dashboard` connection started by an earlier step.
- name: Kill connection to ray cluster
  # Run even when an earlier step failed, matching the spin-down step, so
  # the background process does not linger on the self-hosted runner.
  if: always()
  run: |
    # lsof exits non-zero when nothing matches; under the default `bash -e`
    # shell a bare assignment would abort the step right here.
    PIDS=$(lsof -t -i:8265 || true)
    if [[ -z "$PIDS" ]]; then
      echo "No process found on port 8265; nothing to kill"
      exit 0
    fi
    # lsof may print several PIDs, one per line; handle each separately.
    for PID in $PIDS; do
      echo "Process $PID is listening on port 8265; killing it..."
      # Testing kill directly keeps the failure branch reachable under -e.
      if kill -9 "$PID"; then
        echo "Process $PID killed successfully"
      else
        echo "Failed to kill process $PID"
      fi
    done
# Tear the cluster down again.
- name: Spin down ray cluster
  # Runs regardless of how the earlier steps ended, so a failed run still
  # attempts to remove the cluster.
  if: always()
  run: |
    # The ray CLI lives in the virtual environment.
    source .venv/bin/activate
    # -y: do not wait for an interactive confirmation.
    ray down /tmp/ray.yaml -y
# - name: Upload log files
# uses: actions/upload-artifact@v4
# with:
# name: ray-daft-logs
# path: ray-daft-logs