Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Add cluster profiles #3426

Merged
merged 10 commits into from
Dec 1, 2024
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
cluster_name: '{{RAY_CLUSTER_NAME}}'
# Note:
# The GitHub Actions workflow replaces every `\{{...}}` placeholder below with the
# actual value, which is determined dynamically when the workflow runs.

cluster_name: \{{CLUSTER_NAME}}
raunakab marked this conversation as resolved.
Show resolved Hide resolved

provider:
type: aws
Expand All @@ -8,51 +12,40 @@ provider:
GroupName: ray-autoscaler-c1

auth:
ssh_user: ubuntu
ssh_user: \{{CLUSTER_PROFILE/ssh_user}}
ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem

max_workers: 2
max_workers: \{{CLUSTER_PROFILE/node_count}}
available_node_types:
ray.head.default:
resources: {"CPU": 0}
node_config:
KeyName: ci-github-actions-ray-cluster-key
InstanceType: i3.2xlarge
ImageId: ami-04dd23e62ed049936
InstanceType: \{{CLUSTER_PROFILE/instance_type}}
ImageId: \{{CLUSTER_PROFILE/image_id}}
IamInstanceProfile:
Name: ray-autoscaler-v1

ray.worker.default:
min_workers: 2
max_workers: 2
min_workers: \{{CLUSTER_PROFILE/node_count}}
max_workers: \{{CLUSTER_PROFILE/node_count}}
resources: {}
node_config:
KeyName: ci-github-actions-ray-cluster-key
InstanceType: i3.2xlarge
ImageId: ami-04dd23e62ed049936
InstanceType: \{{CLUSTER_PROFILE/instance_type}}
ImageId: \{{CLUSTER_PROFILE/image_id}}
IamInstanceProfile:
Name: ray-autoscaler-v1

setup_commands:
# Mount drive
- |
findmnt /tmp 1> /dev/null
code=$?
if [ $code -ne 0 ]; then
sudo mkfs.ext4 /dev/nvme0n1
sudo mount -t ext4 /dev/nvme0n1 /tmp
sudo chmod 777 /tmp
fi
# Install dependencies
# GitHub Actions workflow will replace all parameters between `{{...}}` with the
# actual values as determined dynamically during runtime of the actual workflow.
- \{{CLUSTER_PROFILE/volume_mount}}
- sudo snap install aws-cli --classic
- curl -LsSf https://astral.sh/uv/install.sh | sh
- echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
- source ~/.bashrc
- uv python install {{PYTHON_VERSION}}
- uv python pin {{PYTHON_VERSION}}
- uv python install \{{PYTHON_VERSION}}
- uv python pin \{{PYTHON_VERSION}}
- uv v
- echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc
- source .venv/bin/activate
- uv pip install pip ray[default] py-spy getdaft{{DAFT_VERSION}}
- uv pip install pip ray[default] py-spy getdaft\{{DAFT_VERSION}}
89 changes: 89 additions & 0 deletions .github/ci-scripts/templatize_ray_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import sys
from argparse import ArgumentParser
from dataclasses import dataclass
from typing import Optional

# Placeholder tokens searched for verbatim in the template text.
# Every token is a single literal backslash followed by a `{{...}}` name, so
# raw strings are used to spell the backslash without escaping it.
CLUSTER_NAME_PLACEHOLDER = r"\{{CLUSTER_NAME}}"
DAFT_VERSION_PLACEHOLDER = r"\{{DAFT_VERSION}}"
PYTHON_VERSION_PLACEHOLDER = r"\{{PYTHON_VERSION}}"

# Tokens filled in from the fields of the selected cluster profile.
CLUSTER_PROFILE__NODE_COUNT = r"\{{CLUSTER_PROFILE/node_count}}"
CLUSTER_PROFILE__INSTANCE_TYPE = r"\{{CLUSTER_PROFILE/instance_type}}"
CLUSTER_PROFILE__IMAGE_ID = r"\{{CLUSTER_PROFILE/image_id}}"
CLUSTER_PROFILE__SSH_USER = r"\{{CLUSTER_PROFILE/ssh_user}}"
CLUSTER_PROFILE__VOLUME_MOUNT = r"\{{CLUSTER_PROFILE/volume_mount}}"


@dataclass
class Profile:
    """Settings for one named cluster profile.

    Each field supplies the value that is substituted for the matching
    ``CLUSTER_PROFILE/<field>`` placeholder in the ray config template.
    """

    # Number written wherever the node_count placeholder appears.
    node_count: int
    # Instance type string for the cluster nodes (e.g. "t3.large").
    instance_type: str
    # Machine image id for the cluster nodes (e.g. "ami-...").
    image_id: str
    # User name placed in the ssh_user placeholder.
    ssh_user: str
    # Optional setup-command text for mounting a volume; None when the
    # profile has nothing to mount.
    volume_mount: Optional[str] = None


# Registry of the cluster profiles selectable via `--cluster-profile`, keyed by
# profile name. A value of None marks a profile name that is recognised but
# not yet implemented; the script rejects it with an error.
profiles: dict[str, Optional[Profile]] = {
# Single-node profile with no volume mount (volume_mount defaults to None).
"debug_xs-x86": Profile(
instance_type="t3.large",
image_id="ami-04dd23e62ed049936",
node_count=1,
ssh_user="ubuntu",
),
# Four-node profile that formats and mounts /dev/nvme0n1 on /tmp if /tmp is
# not already a mount point.
"medium-x86": Profile(
instance_type="i3.2xlarge",
image_id="ami-04dd23e62ed049936",
node_count=4,
ssh_user="ubuntu",
# The text below is pasted verbatim after "- " in the template's
# setup_commands list, so it begins with a YAML block-scalar marker (`|`).
# NOTE(review): the script lines of a block scalar must be indented in the
# rendered YAML; this view shows them without leading whitespace -- confirm
# the indentation inside this literal in the real file.
volume_mount=""" |
findmnt /tmp 1> /dev/null
code=$?
if [ $code -ne 0 ]; then
sudo mkfs.ext4 /dev/nvme0n1
sudo mount -t ext4 /dev/nvme0n1 /tmp
sudo chmod 777 /tmp
fi""",
),
}


if __name__ == "__main__":
    # Renders the ray config template: the template text is read from stdin,
    # placeholders are substituted from the command-line options, and the
    # result is printed to stdout.
    #
    # Options (all optional):
    #   --cluster-name     value for the cluster-name placeholder
    #   --daft-version     rendered as "==<version>"; when absent or empty the
    #                      placeholder is replaced with an empty string
    #   --python-version   value for the python-version placeholder
    #   --cluster-profile  key into `profiles`; fills the CLUSTER_PROFILE/*
    #                      placeholders
    #
    # Raises Exception when the requested profile is unknown or is registered
    # as None (not yet implemented).
    parser = ArgumentParser()
    parser.add_argument("--cluster-name")
    parser.add_argument("--daft-version")
    parser.add_argument("--python-version")
    parser.add_argument("--cluster-profile")
    # Parse the arguments before touching stdin so that `--help` and argument
    # errors exit immediately instead of first blocking on input.
    args = parser.parse_args()

    content = sys.stdin.read()

    if args.cluster_name:
        content = content.replace(CLUSTER_NAME_PLACEHOLDER, args.cluster_name)

    # The version placeholder is always consumed: either pinned with "==" or
    # removed entirely.
    daft_pin = f"=={args.daft_version}" if args.daft_version else ""
    content = content.replace(DAFT_VERSION_PLACEHOLDER, daft_pin)

    if args.python_version:
        content = content.replace(PYTHON_VERSION_PLACEHOLDER, args.python_version)

    cluster_profile = args.cluster_profile
    if cluster_profile:
        if cluster_profile not in profiles:
            raise Exception(f'Cluster profile "{cluster_profile}" not found')

        profile = profiles[cluster_profile]
        if profile is None:
            raise Exception(f'Cluster profile "{cluster_profile}" not yet implemented')

        # Profiles without a mount command still need a valid setup command
        # in the list, so a harmless echo is substituted instead.
        volume_mount = profile.volume_mount or "echo 'Nothing to mount; skipping'"
        substitutions = (
            (CLUSTER_PROFILE__NODE_COUNT, str(profile.node_count)),
            (CLUSTER_PROFILE__INSTANCE_TYPE, profile.instance_type),
            (CLUSTER_PROFILE__IMAGE_ID, profile.image_id),
            (CLUSTER_PROFILE__SSH_USER, profile.ssh_user),
            (CLUSTER_PROFILE__VOLUME_MOUNT, volume_mount),
        )
        for placeholder, value in substitutions:
            content = content.replace(placeholder, value)

    print(content)
33 changes: 21 additions & 12 deletions .github/workflows/run-cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ on:
description: The version of python to use
required: false
default: "3.9"
cluster_profile:
type: choice
options:
- medium-x86
- debug_xs-x86
description: The profile to use for the cluster
required: false
default: medium-x86
command:
type: string
description: The command to run on the cluster
Expand Down Expand Up @@ -50,14 +58,15 @@ jobs:
uv pip install ray[default] boto3
- name: Dynamically update ray config file
run: |
id="github-ci-${{ github.run_id }}_${{ github.run_attempt }}"
sed -i "s|{{RAY_CLUSTER_NAME}}|$id|g" .github/assets/benchmarking_ray_config.yaml
sed -i 's|{{PYTHON_VERSION}}|${{ inputs.python_version }}|g' .github/assets/benchmarking_ray_config.yaml
if [[ '${{ inputs.daft_version }}' ]]; then
sed -i 's|{{DAFT_VERSION}}|==${{ inputs.daft_version }}|g' .github/assets/benchmarking_ray_config.yaml
else
sed -i 's|{{DAFT_VERSION}}||g' .github/assets/benchmarking_ray_config.yaml
fi
source .venv/bin/activate
(cat .github/assets/.template.yaml \
| python .github/ci-scripts/templatize_ray_config.py \
--cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \
--daft-version '${{ inputs.daft_version }}' \
--python-version '${{ inputs.python_version }}' \
--cluster-profile '${{ inputs.cluster_profile }}'
) >> .github/assets/ray.yaml
cat .github/assets/ray.yaml
- name: Download private ssh key
run: |
KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
Expand All @@ -66,11 +75,11 @@ jobs:
- name: Spin up ray cluster
run: |
source .venv/bin/activate
ray up .github/assets/benchmarking_ray_config.yaml -y
ray up .github/assets/ray.yaml -y
- name: Setup connection to ray cluster
run: |
source .venv/bin/activate
ray dashboard .github/assets/benchmarking_ray_config.yaml &
ray dashboard .github/assets/ray.yaml &
- name: Submit job to ray cluster
run: |
source .venv/bin/activate
Expand All @@ -86,7 +95,7 @@ jobs:
- name: Download log files from ray cluster
run: |
source .venv/bin/activate
ray rsync-down .github/assets/benchmarking_ray_config.yaml /tmp/ray/session_*/logs ray-daft-logs
ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs ray-daft-logs
find ray-daft-logs -depth -name '*:*' -exec bash -c '
for filepath; do
dir=$(dirname "$filepath")
Expand All @@ -111,7 +120,7 @@ jobs:
if: always()
run: |
source .venv/bin/activate
ray down .github/assets/benchmarking_ray_config.yaml -y
ray down .github/assets/ray.yaml -y
- name: Upload log files
uses: actions/upload-artifact@v4
with:
Expand Down
Loading