# run-cluster #188
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow name as shown in the Actions UI.
name: run-cluster

# Manual trigger only: the workflow runs when dispatched by hand with the
# inputs declared below.
on:
  workflow_dispatch:
    inputs:
      # Python version forwarded to the local install action.
      python_version:
        description: Python version
        type: string
        required: false
        # Quoted so the version stays a string (an unquoted 3.9 is a float).
        default: "3.9"
      # NOTE(review): in this file the profile is only referenced by a
      # commented-out step; confirm whether it is still needed.
      cluster_profile:
        description: Cluster profile
        type: choice
        options:
          - medium-x86
          - debug_xs-x86
        required: false
        default: medium-x86
      # Shell command whose standard output becomes the ray config file.
      config_command:
        description: Command to produce ray config file
        type: string
        required: true
      # Executable that is run once the cluster is up.
      entrypoint:
        description: Command
        type: string
        required: true
jobs:
  # Single job: generate a ray config, start the cluster it describes, run the
  # requested entrypoint, then tear the cluster down again.
  run-command:
    runs-on: [self-hosted, linux, x64, ci-dev]
    timeout-minutes: 15  # Remove for ssh debugging
    permissions:
      id-token: write
      contents: read
    env:
      RUN_MODE: ci
    steps:
      - name: Log workflow inputs
        # The JSON is handed over through the environment rather than pasted
        # into the script: its own double quotes would otherwise terminate the
        # echo string and input text would be parsed as shell.
        env:
          WORKFLOW_INPUTS: ${{ toJson(github.event.inputs) }}
        run: echo "$WORKFLOW_INPUTS"

      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      # NOTE(review): no role-to-assume is passed here; presumably credentials
      # come from the self-hosted runner's environment. Confirm.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-west-2
          role-session-name: run-cluster-workflow

      - name: Install uv, rust, python
        uses: ./.github/actions/install
        with:
          python_version: ${{ inputs.python_version }}

      - name: Setup uv environment
        run: |
          # Creates .venv, which every later step activates.
          uv venv
          source .venv/bin/activate
          # Quoted so the brackets are never treated as a glob pattern.
          uv pip install 'ray[default]' boto3

      - name: Produce ray configuration file
        # The command is passed via the environment so its text is never
        # spliced into the script source before the shell parses it.
        env:
          CONFIG_COMMAND: ${{ inputs.config_command }}
        run: |
          source .venv/bin/activate
          # Capture first: a failing command aborts the step here instead of
          # leaving an empty or partial config for the cluster start-up.
          config=$(eval "$CONFIG_COMMAND")
          # Overwrite, so a file left behind by an earlier run is replaced.
          echo "$config" > /tmp/ray.yaml
          cat /tmp/ray.yaml

      # - name: Dynamically update ray config file
      #   run: |
      #     source .venv/bin/activate
      #     (cat .github/assets/template.yaml | \
      #       uv run \
      #         --python 3.12 \
      #         .github/ci-scripts/templatize_ray_config.py \
      #         --cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \
      #         --daft-wheel-url '${{ inputs.daft_wheel_url }}' \
      #         --daft-version '${{ inputs.daft_version }}' \
      #         --python-version '${{ inputs.python_version }}' \
      #         --cluster-profile '${{ inputs.cluster_profile }}' \
      #         --working-dir '${{ inputs.working_dir }}' \
      #         --entrypoint-script '${{ inputs.entrypoint_script }}'
      #     ) >> .github/assets/ray.yaml
      #     cat .github/assets/ray.yaml
      # - name: Setup ray env vars
      #   run: |
      #     source .venv/bin/activate
      #     ray_env_var=$(python .github/ci-scripts/format_env_vars.py \
      #       --env-vars '${{ inputs.env_vars }}' \
      #       --enable-ray-tracing \
      #     )
      #     echo $ray_env_var
      #     echo "ray_env_var=$ray_env_var" >> $GITHUB_ENV

      - name: Download private ssh key
        run: |
          # The runner is self-hosted, so the home directory outlives the job:
          # make sure ~/.ssh exists and replace the key file instead of
          # appending another copy of the key to it on every run.
          mkdir -p ~/.ssh
          umask 077
          KEY=$(aws secretsmanager get-secret-value \
            --secret-id ci-github-actions-ray-cluster-key-3 \
            --query SecretString \
            --output text)
          echo "$KEY" > ~/.ssh/ci-github-actions-ray-cluster-key.pem
          chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem

      - name: Spin up ray cluster
        run: |
          source .venv/bin/activate
          ray up /tmp/ray.yaml -y

      - name: Setup connection to ray cluster
        run: |
          source .venv/bin/activate
          # Left running in the background; a later step kills whatever is
          # listening on port 8265 (presumably this process - confirm).
          ray dashboard /tmp/ray.yaml &

      - name: Run the given entrypoint
        # Passed via the environment so quotes or shell metacharacters in the
        # input cannot alter the script itself.
        env:
          ENTRYPOINT: ${{ inputs.entrypoint }}
        run: |
          source .venv/bin/activate
          if [[ -z "$ENTRYPOINT" ]]; then
            echo 'Invalid entrypoint submitted'
            exit 1
          fi
          # The entrypoint is treated as one executable path.
          chmod +x "$ENTRYPOINT"
          "$ENTRYPOINT"

      # - name: Download log files from ray cluster
      #   run: |
      #     source .venv/bin/activate
      #     ray rsync-down /tmp/ray.yaml /tmp/ray/session_*/logs ray-daft-logs
      #     find ray-daft-logs -depth -name '*:*' -exec bash -c '
      #       for filepath; do
      #         dir=$(dirname "$filepath")
      #         base=$(basename "$filepath")
      #         new_base=${base//:/_}
      #         mv "$filepath" "$dir/$new_base"
      #       done
      #     ' _ {} +

      - name: Kill connection to ray cluster
        # Runs even when an earlier step failed, so the background connection
        # does not linger on the self-hosted runner.
        if: always()
        run: |
          # lsof exits non-zero when nothing is listening; tolerate that so
          # the default fail-fast shell does not abort the step.
          PIDS=$(lsof -t -i:8265 || true)
          if [[ -n "$PIDS" ]]; then
            echo "Process ${PIDS//$'\n'/ } is listening on port 8265; killing it..."
            # Unquoted on purpose: lsof may report several PIDs, one per line.
            # Testing kill directly keeps the failure branch reachable.
            if kill -9 $PIDS; then
              echo "Process ${PIDS//$'\n'/ } killed successfully"
            else
              echo "Failed to kill process ${PIDS//$'\n'/ }"
            fi
          fi

      - name: Spin down ray cluster
        # Always attempt teardown, whatever happened in the steps above.
        if: always()
        run: |
          source .venv/bin/activate
          ray down /tmp/ray.yaml -y

      # - name: Upload log files
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: ray-daft-logs
      #     path: ray-daft-logs