-
Notifications
You must be signed in to change notification settings - Fork 174
176 lines (173 loc) · 5.97 KB
/
run-cluster.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
name: run-cluster

# Manually dispatched workflow. Every value below is supplied through the
# "Run workflow" form (or the equivalent API call) and is read by the jobs in
# this file via `inputs.<name>`.
on:
  workflow_dispatch:
    inputs:
      # Source of the Daft build. When both `daft_wheel_url` and `daft_version`
      # are left empty, the `build-commit` job builds a wheel instead.
      daft_wheel_url:
        description: Daft python-wheel URL
        required: false
        type: string
      daft_version:
        description: 'Daft version (errors if both this and "Daft python-wheel URL" are provided)'
        required: false
        type: string

      # Quoted so that the default stays the string "3.9" rather than a float.
      python_version:
        description: Python version
        required: false
        type: string
        default: "3.9"

      # Forwarded to the ray-config templating script as `--cluster-profile`.
      cluster_profile:
        description: Cluster profile
        required: false
        type: choice
        options:
          - medium-x86
          - debug_xs-x86
        default: medium-x86

      # What to run on the cluster.
      working_dir:
        description: Working directory
        required: false
        type: string
        default: .github/working-dir
      entrypoint_script:
        description: Entry-point python script (must be inside of the working directory)
        required: true
        type: string
      entrypoint_args:
        description: Entry-point arguments (either a simple string or a JSON list)
        required: false
        type: string
        default: ""
      env_vars:
        description: Environment variables
        required: false
        type: string
        default: ""

jobs:
  # Builds a Daft wheel from the current commit. Runs only when the caller
  # supplied neither a Daft version nor a wheel URL; otherwise it is skipped.
  build-commit:
    uses: ./.github/workflows/build-commit.yaml
    if: ${{ inputs.daft_version == '' && inputs.daft_wheel_url == '' }}
    with:
      arch: x86
      python_version: ${{ inputs.python_version }}
    secrets:
      ACTIONS_AWS_ROLE_ARN: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}

  # Spins up a ray cluster, submits the entry-point script to it, collects the
  # logs/outputs, and tears the cluster down again.
  run-command:
    runs-on: [self-hosted, linux, x64, ci-dev]
    # `build-commit` is skipped whenever `daft_version` or `daft_wheel_url` is
    # provided. A skipped dependency would normally skip this job as well, so an
    # explicit condition is required. Unlike a bare `always()`, this condition
    # does not start a cluster after a failed build or a cancelled run.
    if: ${{ !cancelled() && (needs.build-commit.result == 'success' || needs.build-commit.result == 'skipped') }}
    permissions:
      id-token: write
      contents: read
    needs: build-commit
    steps:
      - name: Log workflow inputs
        # The JSON is passed through the environment: splicing it into the
        # script text would let its own double quotes terminate the shell string.
        env:
          WORKFLOW_INPUTS: ${{ toJson(github.event.inputs) }}
        run: echo "$WORKFLOW_INPUTS"

      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-west-2
          role-session-name: run-command-workflow

      - name: Install uv, rust, python
        uses: ./.github/actions/install
        with:
          python_version: ${{ inputs.python_version }}

      - name: Setup uv environment
        run: |
          # Creates `.venv`, which the later steps activate.
          uv venv
          source .venv/bin/activate
          # Quoted so the shell never treats `[default]` as a glob pattern.
          uv pip install 'ray[default]' boto3
          GHA_OUTPUT_DIR=/tmp/outputs
          mkdir -p "$GHA_OUTPUT_DIR"
          echo "Output dir is set to $GHA_OUTPUT_DIR"
          # Exported to all subsequent steps (read by "Upload output dir").
          echo "GHA_OUTPUT_DIR=$GHA_OUTPUT_DIR" >> "$GITHUB_ENV"

      - name: Dynamically update ray config file
        # Inputs are handed over as environment variables instead of being
        # interpolated into the script, so quotes or shell metacharacters in a
        # value cannot alter the command that is executed.
        env:
          CLUSTER_NAME: ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}
          DAFT_WHEEL_URL: ${{ needs.build-commit.outputs.wheel_url || inputs.daft_wheel_url || '' }}
          DAFT_RELEASE_VERSION: ${{ inputs.daft_version }}
          TARGET_PYTHON_VERSION: ${{ inputs.python_version }}
          CLUSTER_PROFILE: ${{ inputs.cluster_profile }}
          JOB_WORKING_DIR: ${{ inputs.working_dir }}
          ENTRYPOINT_SCRIPT: ${{ inputs.entrypoint_script }}
        run: |
          source .venv/bin/activate
          # The template is fed on stdin; the rendered config is written to ray.yaml.
          (cat .github/assets/template.yaml | \
            uv run \
              --python 3.12 \
              .github/ci-scripts/templatize_ray_config.py \
              --cluster-name="$CLUSTER_NAME" \
              --daft-wheel-url="$DAFT_WHEEL_URL" \
              --daft-version="$DAFT_RELEASE_VERSION" \
              --python-version="$TARGET_PYTHON_VERSION" \
              --cluster-profile="$CLUSTER_PROFILE" \
              --working-dir="$JOB_WORKING_DIR" \
              --entrypoint-script="$ENTRYPOINT_SCRIPT"
          ) >> .github/assets/ray.yaml
          cat .github/assets/ray.yaml

      - name: Download private ssh key
        run: |
          # Ensure the key (and ~/.ssh, if it has to be created) is never
          # readable by other users, not even briefly before the chmod.
          umask 077
          mkdir -p ~/.ssh
          KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
          # Overwrite rather than append: on a runner whose home directory
          # persists between runs, appending would stack multiple copies of the key.
          echo "$KEY" > ~/.ssh/ci-github-actions-ray-cluster-key.pem
          chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem

      - name: Spin up ray cluster
        run: |
          source .venv/bin/activate
          ray up .github/assets/ray.yaml -y

      - name: Setup connection to ray cluster
        run: |
          source .venv/bin/activate
          # Backgrounded so the step returns; stopped again in
          # "Kill connection to ray cluster" (port 8265).
          ray dashboard .github/assets/ray.yaml &

      - name: Submit job to ray cluster
        # See "Dynamically update ray config file" for why inputs go through env.
        env:
          JOB_WORKING_DIR: ${{ inputs.working_dir }}
          ENTRYPOINT_SCRIPT: ${{ inputs.entrypoint_script }}
          ENTRYPOINT_ARGS: ${{ inputs.entrypoint_args }}
          JOB_ENV_VARS: ${{ inputs.env_vars }}
        run: |
          source .venv/bin/activate
          if [[ -z "$ENTRYPOINT_SCRIPT" ]]; then
            echo 'Invalid command submitted; command cannot be empty'
            exit 1
          fi
          echo "Output dir: $GHA_OUTPUT_DIR"
          python .github/ci-scripts/job_runner.py \
            --working-dir="$JOB_WORKING_DIR" \
            --entrypoint-script="$ENTRYPOINT_SCRIPT" \
            --entrypoint-args="$ENTRYPOINT_ARGS" \
            --env-vars="$JOB_ENV_VARS" \
            --enable-ray-tracing

      - name: Download log files from ray cluster
        if: always()
        run: |
          source .venv/bin/activate
          # The glob is quoted so it is never expanded against the runner's own
          # /tmp/ray; it is passed through verbatim for the remote side to resolve.
          ray rsync-down .github/assets/ray.yaml '/tmp/ray/session_*/logs' ray-daft-logs
          # Rename every file/directory whose name contains ':' to use '_'
          # instead. `-depth` visits children before their parent directory so
          # renaming a directory does not invalidate paths still to be processed.
          find ray-daft-logs -depth -name '*:*' -exec bash -c '
            for filepath; do
              dir=$(dirname "$filepath")
              base=$(basename "$filepath")
              new_base=${base//:/_}
              mv "$filepath" "$dir/$new_base"
            done
          ' _ {} +

      - name: Kill connection to ray cluster
        # Runs even after a failed step so the background `ray dashboard`
        # process does not outlive the job on the self-hosted runner.
        if: always()
        run: |
          # `lsof` exits non-zero when nothing matches; `|| true` keeps that from
          # failing the step under the runner's default `bash -e` shell.
          PIDS=$(lsof -t -i:8265 || true)
          # `lsof -t` may print several PIDs (one per line); handle each in turn.
          for pid in $PIDS; do
            echo "Process $pid is listening on port 8265; killing it..."
            if kill -9 "$pid"; then
              echo "Process $pid killed successfully"
            else
              echo "Failed to kill process $pid"
            fi
          done

      - name: Spin down ray cluster
        if: always()
        run: |
          source .venv/bin/activate
          ray down .github/assets/ray.yaml -y

      - name: Upload output dir
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: outputs
          path: ${{ env.GHA_OUTPUT_DIR }}

      - name: Upload log files
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ray-daft-logs
          path: ray-daft-logs