-
Notifications
You must be signed in to change notification settings - Fork 174
146 lines (144 loc) · 4.72 KB
/
run-cluster.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Manually-triggered workflow: the `jobs` section of this file spins up a Ray
# cluster from a generated config file, runs a user-supplied entrypoint against
# it, and tears the cluster down again.
name: run-cluster

on:
  workflow_dispatch:
    inputs:
      # Forwarded to the local `install` composite action.
      # Quoted so YAML keeps it a string instead of parsing it as a float.
      python_version:
        description: Python version
        type: string
        required: false
        default: "3.9"
      # NOTE(review): currently only referenced by the commented-out
      # "Dynamically update ray config file" step; no active step reads it.
      cluster_profile:
        description: Cluster profile
        type: choice
        options:
          - medium-x86
          - debug_xs-x86
        required: false
        default: medium-x86
      # Shell command whose stdout becomes the ray config at /tmp/ray.yaml.
      config_command:
        description: Command to produce ray config file
        type: string
        required: true
      # Made executable and then run once the cluster connection is set up.
      entrypoint:
        description: Command
        type: string
        required: true
jobs:
  # Generates a ray config, brings the cluster up, runs the requested
  # entrypoint, then closes the dashboard connection and tears the cluster down.
  run-command:
    runs-on: [self-hosted, linux, x64, ci-dev]
    timeout-minutes: 15  # Remove for ssh debugging
    permissions:
      id-token: write
      contents: read
    env:
      RUN_MODE: ci
    steps:
      - name: Log workflow inputs
        # The JSON is handed over through an environment variable rather than
        # spliced into the script text: its embedded double quotes would
        # otherwise terminate the shell string, and arbitrary input text would
        # be parsed as shell code.
        env:
          WORKFLOW_INPUTS: ${{ toJson(github.event.inputs) }}
        run: printf '%s\n' "$WORKFLOW_INPUTS"

      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-west-2
          role-session-name: run-cluster-workflow

      - name: Install uv, rust, python
        uses: ./.github/actions/install
        with:
          python_version: ${{ inputs.python_version }}

      - name: Setup uv environment
        # `ray[default]` is quoted so bash never treats the brackets as a glob.
        run: |
          uv venv
          source .venv/bin/activate
          uv pip install "ray[default]" boto3

      - name: Produce ray configuration file
        # The command is run directly (not inside `echo "$(...)"`) so that a
        # failing config command fails this step instead of silently writing
        # an empty config file.
        env:
          CONFIG_COMMAND: ${{ inputs.config_command }}
        run: |
          source .venv/bin/activate
          # Drop any config left behind by a previous run on this runner.
          rm -f /tmp/ray.yaml
          bash -c "$CONFIG_COMMAND" > /tmp/ray.yaml
          cat /tmp/ray.yaml

      # NOTE(review): the two disabled steps below reference inputs that this
      # workflow does not declare (daft_wheel_url, daft_version, working_dir,
      # entrypoint_script, env_vars); they need updating before re-enabling.
      # - name: Dynamically update ray config file
      #   run: |
      #     source .venv/bin/activate
      #     (cat .github/assets/template.yaml | \
      #       uv run \
      #         --python 3.12 \
      #         .github/ci-scripts/templatize_ray_config.py \
      #         --cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \
      #         --daft-wheel-url '${{ inputs.daft_wheel_url }}' \
      #         --daft-version '${{ inputs.daft_version }}' \
      #         --python-version '${{ inputs.python_version }}' \
      #         --cluster-profile '${{ inputs.cluster_profile }}' \
      #         --working-dir '${{ inputs.working_dir }}' \
      #         --entrypoint-script '${{ inputs.entrypoint_script }}'
      #     ) >> .github/assets/ray.yaml
      #     cat .github/assets/ray.yaml
      # - name: Setup ray env vars
      #   run: |
      #     source .venv/bin/activate
      #     ray_env_var=$(python .github/ci-scripts/format_env_vars.py \
      #       --env-vars '${{ inputs.env_vars }}' \
      #       --enable-ray-tracing \
      #     )
      #     echo $ray_env_var
      #     echo "ray_env_var=$ray_env_var" >> $GITHUB_ENV

      - name: Download private ssh key
        # The key file is overwritten, not appended to: on a persistent
        # self-hosted runner an append would stack one copy of the key per run
        # and leave an unusable .pem. The restrictive umask ensures the file is
        # never readable by other users, even briefly.
        run: |
          KEY_FILE="$HOME/.ssh/ci-github-actions-ray-cluster-key.pem"
          mkdir -p "$HOME/.ssh"
          KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
          (umask 077 && printf '%s\n' "$KEY" > "$KEY_FILE")
          chmod 600 "$KEY_FILE"

      - name: Spin up ray cluster
        run: |
          source .venv/bin/activate
          ray up /tmp/ray.yaml -y

      - name: Setup connection to ray cluster
        # Left running in the background; the "Kill connection" step below
        # looks for it on port 8265.
        run: |
          source .venv/bin/activate
          ray dashboard /tmp/ray.yaml &

      - name: Run the given entrypoint
        # The input reaches the script via an environment variable and is split
        # on whitespace into an argv array, so quotes or shell metacharacters
        # in the input cannot break or inject into this script.
        env:
          ENTRYPOINT: ${{ inputs.entrypoint }}
        run: |
          source .venv/bin/activate
          read -r -a entrypoint_argv <<< "$ENTRYPOINT"
          if [[ ${#entrypoint_argv[@]} -eq 0 ]]; then
            echo 'Invalid entrypoint submitted'
            exit 1
          fi
          chmod +x "${entrypoint_argv[@]}"
          "${entrypoint_argv[@]}"

      # - name: Download log files from ray cluster
      #   run: |
      #     source .venv/bin/activate
      #     ray rsync-down /tmp/ray.yaml /tmp/ray/session_*/logs ray-daft-logs
      #     find ray-daft-logs -depth -name '*:*' -exec bash -c '
      #       for filepath; do
      #         dir=$(dirname "$filepath")
      #         base=$(basename "$filepath")
      #         new_base=${base//:/_}
      #         mv "$filepath" "$dir/$new_base"
      #       done
      #     ' _ {} +

      - name: Kill connection to ray cluster
        # Runs even when an earlier step failed so the background process does
        # not outlive the job on the self-hosted runner.
        if: always()
        run: |
          # lsof exits non-zero when nothing matches; `|| true` keeps that from
          # aborting the step under the default `bash -e` shell.
          PIDS=$(lsof -t -i:8265 || true)
          # Unquoted on purpose: lsof prints one PID per line.
          for PID in $PIDS; do
            echo "Process $PID is listening on port 8265; killing it..."
            if kill -9 "$PID"; then
              echo "Process $PID killed successfully"
            else
              echo "Failed to kill process $PID"
            fi
          done

      - name: Spin down ray cluster
        if: always()
        run: |
          source .venv/bin/activate
          ray down /tmp/ray.yaml -y

      # - name: Upload log files
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: ray-daft-logs
      #     path: ray-daft-logs