Skip to content

Commit

Permalink
[FEAT] Add cluster profiles (#3426)
Browse files Browse the repository at this point in the history
# Overview
- When running a job on a Ray cluster using GHA, we want to be able to
customize the cluster's configuration.
  - this is achieved via "cluster profiles"
- essentially, this is a fixed set of configurations (e.g.,
`medium-x86`, `debug_xs-x86`, etc.) that end-users can select from
- this will take care of all of the configurations without leaking the
internals of the ray-configuration story

## Available Options
- `medium-x86`
- `debug_xs-x86`

I plan to add more in the future. For now, this should suffice.
  • Loading branch information
Raunak Bhagat authored Dec 1, 2024
1 parent 8652eba commit b5f60e0
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 36 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
cluster_name: '{{RAY_CLUSTER_NAME}}'
# Note:
# GitHub Actions workflow will replace all parameters between `{{...}}` with the
# actual values as determined dynamically during runtime of the actual workflow.

cluster_name: \{{CLUSTER_NAME}}

provider:
type: aws
Expand All @@ -8,51 +12,40 @@ provider:
GroupName: ray-autoscaler-c1

auth:
ssh_user: ubuntu
ssh_user: \{{CLUSTER_PROFILE/ssh_user}}
ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem

max_workers: 2
max_workers: \{{CLUSTER_PROFILE/node_count}}
available_node_types:
ray.head.default:
resources: {"CPU": 0}
node_config:
KeyName: ci-github-actions-ray-cluster-key
InstanceType: i3.2xlarge
ImageId: ami-04dd23e62ed049936
InstanceType: \{{CLUSTER_PROFILE/instance_type}}
ImageId: \{{CLUSTER_PROFILE/image_id}}
IamInstanceProfile:
Name: ray-autoscaler-v1

ray.worker.default:
min_workers: 2
max_workers: 2
min_workers: \{{CLUSTER_PROFILE/node_count}}
max_workers: \{{CLUSTER_PROFILE/node_count}}
resources: {}
node_config:
KeyName: ci-github-actions-ray-cluster-key
InstanceType: i3.2xlarge
ImageId: ami-04dd23e62ed049936
InstanceType: \{{CLUSTER_PROFILE/instance_type}}
ImageId: \{{CLUSTER_PROFILE/image_id}}
IamInstanceProfile:
Name: ray-autoscaler-v1

setup_commands:
# Mount drive
- |
findmnt /tmp 1> /dev/null
code=$?
if [ $code -ne 0 ]; then
sudo mkfs.ext4 /dev/nvme0n1
sudo mount -t ext4 /dev/nvme0n1 /tmp
sudo chmod 777 /tmp
fi
# Install dependencies
# GitHub Actions workflow will replace all parameters between `{{...}}` with the
# actual values as determined dynamically during runtime of the actual workflow.
- \{{CLUSTER_PROFILE/volume_mount}}
- sudo snap install aws-cli --classic
- curl -LsSf https://astral.sh/uv/install.sh | sh
- echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
- source ~/.bashrc
- uv python install {{PYTHON_VERSION}}
- uv python pin {{PYTHON_VERSION}}
- uv python install \{{PYTHON_VERSION}}
- uv python pin \{{PYTHON_VERSION}}
- uv v
- echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc
- source .venv/bin/activate
- uv pip install pip ray[default] py-spy getdaft{{DAFT_VERSION}}
- uv pip install pip ray[default] py-spy getdaft\{{DAFT_VERSION}}
89 changes: 89 additions & 0 deletions .github/ci-scripts/templatize_ray_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import sys
from argparse import ArgumentParser
from dataclasses import dataclass
from typing import Optional

# Placeholder tokens that the script searches for in the template text read
# from stdin; every occurrence is replaced verbatim with `str.replace`.
# NOTE(review): each literal starts with an escaped backslash, so the text
# actually matched is `\{{...}}` (backslash included) — confirm this matches the
# placeholders as written in the template file.
CLUSTER_NAME_PLACEHOLDER = "\\{{CLUSTER_NAME}}"
DAFT_VERSION_PLACEHOLDER = "\\{{DAFT_VERSION}}"
PYTHON_VERSION_PLACEHOLDER = "\\{{PYTHON_VERSION}}"
# Per-profile placeholders; each one corresponds to a field of `Profile`.
CLUSTER_PROFILE__NODE_COUNT = "\\{{CLUSTER_PROFILE/node_count}}"
CLUSTER_PROFILE__INSTANCE_TYPE = "\\{{CLUSTER_PROFILE/instance_type}}"
CLUSTER_PROFILE__IMAGE_ID = "\\{{CLUSTER_PROFILE/image_id}}"
CLUSTER_PROFILE__SSH_USER = "\\{{CLUSTER_PROFILE/ssh_user}}"
CLUSTER_PROFILE__VOLUME_MOUNT = "\\{{CLUSTER_PROFILE/volume_mount}}"


@dataclass
class Profile:
    """Settings that make up one selectable cluster profile.

    Each field provides the value substituted for the corresponding
    ``CLUSTER_PROFILE/<field>`` placeholder in the cluster config template.
    """

    # Number of nodes; converted with ``str`` before substitution.
    node_count: int
    # Value substituted for the ``instance_type`` placeholder.
    instance_type: str
    # Value substituted for the ``image_id`` placeholder.
    image_id: str
    # Value substituted for the ``ssh_user`` placeholder.
    ssh_user: str
    # Optional text substituted for the ``volume_mount`` placeholder; when it
    # is unset or empty, the script substitutes a no-op ``echo`` command.
    volume_mount: Optional[str] = None


# Registry of selectable cluster profiles, keyed by the name passed via
# `--cluster-profile`. A value of None marks a profile that is declared but not
# yet implemented; the script raises an error when such a profile is requested.
profiles: dict[str, Optional[Profile]] = {
# One t3.large node; volume_mount is left at its default (None), so the script
# substitutes its "nothing to mount" echo command instead.
"debug_xs-x86": Profile(
instance_type="t3.large",
image_id="ami-04dd23e62ed049936",
node_count=1,
ssh_user="ubuntu",
),
# node_count of 4 on i3.2xlarge. volume_mount is a shell snippet that, when
# `findmnt /tmp` exits non-zero, formats /dev/nvme0n1 as ext4, mounts it at
# /tmp, and sets /tmp to mode 777.
# NOTE(review): the string starts with " |", presumably so that it forms a YAML
# block scalar once substituted after the "- " list marker in the template —
# confirm the indentation of the continuation lines against the template.
"medium-x86": Profile(
instance_type="i3.2xlarge",
image_id="ami-04dd23e62ed049936",
node_count=4,
ssh_user="ubuntu",
volume_mount=""" |
findmnt /tmp 1> /dev/null
code=$?
if [ $code -ne 0 ]; then
sudo mkfs.ext4 /dev/nvme0n1
sudo mount -t ext4 /dev/nvme0n1 /tmp
sudo chmod 777 /tmp
fi""",
),
}


def _apply_cluster_profile(content: str, cluster_profile: str) -> str:
    """Return ``content`` with every ``CLUSTER_PROFILE/...`` placeholder filled in.

    Args:
        content: Template text containing the profile placeholders.
        cluster_profile: Key into the module-level ``profiles`` registry.

    Returns:
        The text with the profile's values substituted.

    Raises:
        ValueError: If ``cluster_profile`` is not a key of ``profiles``, or if
            its entry is ``None`` (declared but not yet implemented).
    """
    if cluster_profile not in profiles:
        raise ValueError(f'Cluster profile "{cluster_profile}" not found')

    profile = profiles[cluster_profile]
    if profile is None:
        raise ValueError(f'Cluster profile "{cluster_profile}" not yet implemented')

    # A profile without a mount snippet still needs some command in the
    # placeholder's position, so fall back to a harmless echo.
    volume_mount = profile.volume_mount or "echo 'Nothing to mount; skipping'"

    substitutions = (
        (CLUSTER_PROFILE__NODE_COUNT, str(profile.node_count)),
        (CLUSTER_PROFILE__INSTANCE_TYPE, profile.instance_type),
        (CLUSTER_PROFILE__IMAGE_ID, profile.image_id),
        (CLUSTER_PROFILE__SSH_USER, profile.ssh_user),
        (CLUSTER_PROFILE__VOLUME_MOUNT, volume_mount),
    )
    for placeholder, value in substitutions:
        content = content.replace(placeholder, value)
    return content


def main() -> None:
    """Read a config template from stdin, fill its placeholders, print the result.

    Placeholders whose corresponding option is omitted (or empty) are left
    untouched, except the daft-version placeholder, which is removed.
    """
    parser = ArgumentParser()
    parser.add_argument("--cluster-name")
    parser.add_argument("--daft-version")
    parser.add_argument("--python-version")
    parser.add_argument("--cluster-profile")
    # Parse arguments before touching stdin so that `--help` and argument
    # errors exit immediately instead of blocking on input.
    args = parser.parse_args()

    content = sys.stdin.read()

    if args.cluster_name:
        content = content.replace(CLUSTER_NAME_PLACEHOLDER, args.cluster_name)

    # With no version given the placeholder is removed entirely; otherwise it
    # becomes an exact `==<version>` pin.
    daft_pin = f"=={args.daft_version}" if args.daft_version else ""
    content = content.replace(DAFT_VERSION_PLACEHOLDER, daft_pin)

    if args.python_version:
        content = content.replace(PYTHON_VERSION_PLACEHOLDER, args.python_version)

    if args.cluster_profile:
        content = _apply_cluster_profile(content, args.cluster_profile)

    print(content)


if __name__ == "__main__":
    main()
33 changes: 21 additions & 12 deletions .github/workflows/run-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ on:
description: The version of python to use
required: false
default: "3.9"
cluster_profile:
type: choice
options:
- medium-x86
- debug_xs-x86
description: The profile to use for the cluster
required: false
default: medium-x86
command:
type: string
description: The command to run on the cluster
Expand Down Expand Up @@ -50,14 +58,15 @@ jobs:
uv pip install ray[default] boto3
- name: Dynamically update ray config file
run: |
id="github-ci-${{ github.run_id }}_${{ github.run_attempt }}"
sed -i "s|{{RAY_CLUSTER_NAME}}|$id|g" .github/assets/benchmarking_ray_config.yaml
sed -i 's|{{PYTHON_VERSION}}|${{ inputs.python_version }}|g' .github/assets/benchmarking_ray_config.yaml
if [[ '${{ inputs.daft_version }}' ]]; then
sed -i 's|{{DAFT_VERSION}}|==${{ inputs.daft_version }}|g' .github/assets/benchmarking_ray_config.yaml
else
sed -i 's|{{DAFT_VERSION}}||g' .github/assets/benchmarking_ray_config.yaml
fi
source .venv/bin/activate
(cat .github/assets/.template.yaml \
| python .github/ci-scripts/templatize_ray_config.py \
--cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \
--daft-version '${{ inputs.daft_version }}' \
--python-version '${{ inputs.python_version }}' \
--cluster-profile '${{ inputs.cluster_profile }}'
) >> .github/assets/ray.yaml
cat .github/assets/ray.yaml
- name: Download private ssh key
run: |
KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
Expand All @@ -66,11 +75,11 @@ jobs:
- name: Spin up ray cluster
run: |
source .venv/bin/activate
ray up .github/assets/benchmarking_ray_config.yaml -y
ray up .github/assets/ray.yaml -y
- name: Setup connection to ray cluster
run: |
source .venv/bin/activate
ray dashboard .github/assets/benchmarking_ray_config.yaml &
ray dashboard .github/assets/ray.yaml &
- name: Submit job to ray cluster
run: |
source .venv/bin/activate
Expand All @@ -86,7 +95,7 @@ jobs:
- name: Download log files from ray cluster
run: |
source .venv/bin/activate
ray rsync-down .github/assets/benchmarking_ray_config.yaml /tmp/ray/session_*/logs ray-daft-logs
ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs ray-daft-logs
find ray-daft-logs -depth -name '*:*' -exec bash -c '
for filepath; do
dir=$(dirname "$filepath")
Expand All @@ -111,7 +120,7 @@ jobs:
if: always()
run: |
source .venv/bin/activate
ray down .github/assets/benchmarking_ray_config.yaml -y
ray down .github/assets/ray.yaml -y
- name: Upload log files
uses: actions/upload-artifact@v4
with:
Expand Down

0 comments on commit b5f60e0

Please sign in to comment.