From b5f60e000a0aebf1a3eb6633905c7ccdab96c6a4 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Sun, 1 Dec 2024 11:00:55 -0800 Subject: [PATCH] [FEAT] Add cluster profiles (#3426) # Overview - When running a job on a ray-cluster using GHA, we want to be able to choose the configuration of the cluster. - this is achieved via "cluster profiles" - essentially, this is a fixed set of configurations (e.g., `medium-x86`, `debug_xs-x86`, etc.) that end-users can select from - this will take care of all of the configurations without leaking the internals of the ray-configuration story ## Available Options - `medium-x86` - `debug_xs-x86` I plan to add more in the future. For now, this should suffice. --- ...marking_ray_config.yaml => .template.yaml} | 41 ++++----- .github/ci-scripts/templatize_ray_config.py | 89 +++++++++++++++++++ .github/workflows/run-cluster.yaml | 33 ++++--- 3 files changed, 127 insertions(+), 36 deletions(-) rename .github/assets/{benchmarking_ray_config.yaml => .template.yaml} (62%) create mode 100644 .github/ci-scripts/templatize_ray_config.py diff --git a/.github/assets/benchmarking_ray_config.yaml b/.github/assets/.template.yaml similarity index 62% rename from .github/assets/benchmarking_ray_config.yaml rename to .github/assets/.template.yaml index 7322cd8ef9..4457633671 100644 --- a/.github/assets/benchmarking_ray_config.yaml +++ b/.github/assets/.template.yaml @@ -1,4 +1,8 @@ -cluster_name: '{{RAY_CLUSTER_NAME}}' +# Note: +# GitHub Actions workflow will replace all parameters between `{{...}}` with the +# actual values as determined dynamically during runtime of the actual workflow. 
+ +cluster_name: \{{CLUSTER_NAME}} provider: type: aws @@ -8,51 +12,40 @@ provider: GroupName: ray-autoscaler-c1 auth: - ssh_user: ubuntu + ssh_user: \{{CLUSTER_PROFILE/ssh_user}} ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem -max_workers: 2 +max_workers: \{{CLUSTER_PROFILE/node_count}} available_node_types: ray.head.default: resources: {"CPU": 0} node_config: KeyName: ci-github-actions-ray-cluster-key - InstanceType: i3.2xlarge - ImageId: ami-04dd23e62ed049936 + InstanceType: \{{CLUSTER_PROFILE/instance_type}} + ImageId: \{{CLUSTER_PROFILE/image_id}} IamInstanceProfile: Name: ray-autoscaler-v1 ray.worker.default: - min_workers: 2 - max_workers: 2 + min_workers: \{{CLUSTER_PROFILE/node_count}} + max_workers: \{{CLUSTER_PROFILE/node_count}} resources: {} node_config: KeyName: ci-github-actions-ray-cluster-key - InstanceType: i3.2xlarge - ImageId: ami-04dd23e62ed049936 + InstanceType: \{{CLUSTER_PROFILE/instance_type}} + ImageId: \{{CLUSTER_PROFILE/image_id}} IamInstanceProfile: Name: ray-autoscaler-v1 setup_commands: -# Mount drive -- | - findmnt /tmp 1> /dev/null - code=$? - if [ $code -ne 0 ]; then - sudo mkfs.ext4 /dev/nvme0n1 - sudo mount -t ext4 /dev/nvme0n1 /tmp - sudo chmod 777 /tmp - fi -# Install dependencies -# GitHub Actions workflow will replace all parameters between `{{...}}` with the -# actual values as determined dynamically during runtime of the actual workflow. 
+- \{{CLUSTER_PROFILE/volume_mount}} - sudo snap install aws-cli --classic - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc - source ~/.bashrc -- uv python install {{PYTHON_VERSION}} -- uv python pin {{PYTHON_VERSION}} +- uv python install \{{PYTHON_VERSION}} +- uv python pin \{{PYTHON_VERSION}} - uv v - echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc - source .venv/bin/activate -- uv pip install pip ray[default] py-spy getdaft{{DAFT_VERSION}} +- uv pip install pip ray[default] py-spy getdaft\{{DAFT_VERSION}} diff --git a/.github/ci-scripts/templatize_ray_config.py b/.github/ci-scripts/templatize_ray_config.py new file mode 100644 index 0000000000..c871a91419 --- /dev/null +++ b/.github/ci-scripts/templatize_ray_config.py @@ -0,0 +1,89 @@ +import sys +from argparse import ArgumentParser +from dataclasses import dataclass +from typing import Optional + +CLUSTER_NAME_PLACEHOLDER = "\\{{CLUSTER_NAME}}" +DAFT_VERSION_PLACEHOLDER = "\\{{DAFT_VERSION}}" +PYTHON_VERSION_PLACEHOLDER = "\\{{PYTHON_VERSION}}" +CLUSTER_PROFILE__NODE_COUNT = "\\{{CLUSTER_PROFILE/node_count}}" +CLUSTER_PROFILE__INSTANCE_TYPE = "\\{{CLUSTER_PROFILE/instance_type}}" +CLUSTER_PROFILE__IMAGE_ID = "\\{{CLUSTER_PROFILE/image_id}}" +CLUSTER_PROFILE__SSH_USER = "\\{{CLUSTER_PROFILE/ssh_user}}" +CLUSTER_PROFILE__VOLUME_MOUNT = "\\{{CLUSTER_PROFILE/volume_mount}}" + + +@dataclass +class Profile: + node_count: int + instance_type: str + image_id: str + ssh_user: str + volume_mount: Optional[str] = None + + +profiles: dict[str, Optional[Profile]] = { + "debug_xs-x86": Profile( + instance_type="t3.large", + image_id="ami-04dd23e62ed049936", + node_count=1, + ssh_user="ubuntu", + ), + "medium-x86": Profile( + instance_type="i3.2xlarge", + image_id="ami-04dd23e62ed049936", + node_count=4, + ssh_user="ubuntu", + volume_mount=""" | + findmnt /tmp 1> /dev/null + code=$? 
+ if [ $code -ne 0 ]; then + sudo mkfs.ext4 /dev/nvme0n1 + sudo mount -t ext4 /dev/nvme0n1 /tmp + sudo chmod 777 /tmp + fi""", + ), +} + + +if __name__ == "__main__": + content = sys.stdin.read() + + parser = ArgumentParser() + parser.add_argument("--cluster-name") + parser.add_argument("--daft-version") + parser.add_argument("--python-version") + parser.add_argument("--cluster-profile") + args = parser.parse_args() + + if args.cluster_name: + content = content.replace(CLUSTER_NAME_PLACEHOLDER, args.cluster_name) + + if args.daft_version: + content = content.replace(DAFT_VERSION_PLACEHOLDER, f"=={args.daft_version}") + else: + content = content.replace(DAFT_VERSION_PLACEHOLDER, "") + + if args.python_version: + content = content.replace(PYTHON_VERSION_PLACEHOLDER, args.python_version) + + if cluster_profile := args.cluster_profile: + cluster_profile: str + if cluster_profile not in profiles: + raise Exception(f'Cluster profile "{cluster_profile}" not found') + + profile = profiles[cluster_profile] + if profile is None: + raise Exception(f'Cluster profile "{cluster_profile}" not yet implemented') + + assert profile is not None + content = content.replace(CLUSTER_PROFILE__NODE_COUNT, str(profile.node_count)) + content = content.replace(CLUSTER_PROFILE__INSTANCE_TYPE, profile.instance_type) + content = content.replace(CLUSTER_PROFILE__IMAGE_ID, profile.image_id) + content = content.replace(CLUSTER_PROFILE__SSH_USER, profile.ssh_user) + if profile.volume_mount: + content = content.replace(CLUSTER_PROFILE__VOLUME_MOUNT, profile.volume_mount) + else: + content = content.replace(CLUSTER_PROFILE__VOLUME_MOUNT, "echo 'Nothing to mount; skipping'") + + print(content) diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml index 55c26d0a55..8c403f41f9 100644 --- a/.github/workflows/run-cluster.yaml +++ b/.github/workflows/run-cluster.yaml @@ -12,6 +12,14 @@ on: description: The version of python to use required: false default: "3.9" + 
cluster_profile: + type: choice + options: + - medium-x86 + - debug_xs-x86 + description: The profile to use for the cluster + required: false + default: medium-x86 command: type: string description: The command to run on the cluster @@ -50,14 +58,15 @@ jobs: uv pip install ray[default] boto3 - name: Dynamically update ray config file run: | - id="github-ci-${{ github.run_id }}_${{ github.run_attempt }}" - sed -i "s|{{RAY_CLUSTER_NAME}}|$id|g" .github/assets/benchmarking_ray_config.yaml - sed -i 's|{{PYTHON_VERSION}}|${{ inputs.python_version }}|g' .github/assets/benchmarking_ray_config.yaml - if [[ '${{ inputs.daft_version }}' ]]; then - sed -i 's|{{DAFT_VERSION}}|==${{ inputs.daft_version }}|g' .github/assets/benchmarking_ray_config.yaml - else - sed -i 's|{{DAFT_VERSION}}||g' .github/assets/benchmarking_ray_config.yaml - fi + source .venv/bin/activate + (cat .github/assets/.template.yaml \ + | python .github/ci-scripts/templatize_ray_config.py \ + --cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \ + --daft-version '${{ inputs.daft_version }}' \ + --python-version '${{ inputs.python_version }}' \ + --cluster-profile '${{ inputs.cluster_profile }}' + ) >> .github/assets/ray.yaml + cat .github/assets/ray.yaml - name: Download private ssh key run: | KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text) @@ -66,11 +75,11 @@ jobs: - name: Spin up ray cluster run: | source .venv/bin/activate - ray up .github/assets/benchmarking_ray_config.yaml -y + ray up .github/assets/ray.yaml -y - name: Setup connection to ray cluster run: | source .venv/bin/activate - ray dashboard .github/assets/benchmarking_ray_config.yaml & + ray dashboard .github/assets/ray.yaml & - name: Submit job to ray cluster run: | source .venv/bin/activate @@ -86,7 +95,7 @@ jobs: - name: Download log files from ray cluster run: | source .venv/bin/activate - ray rsync-down 
.github/assets/benchmarking_ray_config.yaml /tmp/ray/session_*/logs ray-daft-logs + ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs ray-daft-logs find ray-daft-logs -depth -name '*:*' -exec bash -c ' for filepath; do dir=$(dirname "$filepath") @@ -111,7 +120,7 @@ jobs: if: always() run: | source .venv/bin/activate - ray down .github/assets/benchmarking_ray_config.yaml -y + ray down .github/assets/ray.yaml -y - name: Upload log files uses: actions/upload-artifact@v4 with: