From 66b33f17e0903af27c2c4f1d5273a875f4f63446 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 14:51:17 -0800 Subject: [PATCH 1/8] Change name to template.yaml - makes more sense - this is a template file which will be dynamically string-replaced at runtime by the GHA --- ...hmarking_ray_config.yaml => template.yaml} | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) rename .github/assets/{benchmarking_ray_config.yaml => template.yaml} (72%) diff --git a/.github/assets/benchmarking_ray_config.yaml b/.github/assets/template.yaml similarity index 72% rename from .github/assets/benchmarking_ray_config.yaml rename to .github/assets/template.yaml index 8e098c8b19..f1406befaf 100644 --- a/.github/assets/benchmarking_ray_config.yaml +++ b/.github/assets/template.yaml @@ -1,4 +1,8 @@ -cluster_name: '{{RAY_CLUSTER_NAME}}' +# Note: +# GitHub Actions workflow will replace all parameters between `{{...}}` with the +# actual values as determined dynamically during runtime of the actual workflow. + +cluster_name: '{{CLUSTER_NAME}}' provider: type: aws @@ -11,41 +15,30 @@ auth: ssh_user: ubuntu ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem -max_workers: 2 +max_workers: '{{CLUSTER_PROFILE/node_count}}' available_node_types: ray.head.default: resources: {"CPU": 0} node_config: KeyName: ci-github-actions-ray-cluster-key - InstanceType: i3.2xlarge - ImageId: ami-04dd23e62ed049936 + InstanceType: '{{CLUSTER_PROFILE/instance_type}}' + ImageId: '{{CLUSTER_PROFILE/image_id}}' IamInstanceProfile: Name: ray-autoscaler-v1 ray.worker.default: - min_workers: 2 - max_workers: 2 + min_workers: '{{CLUSTER_PROFILE/node_count}}' + max_workers: '{{CLUSTER_PROFILE/node_count}}' resources: {} node_config: KeyName: ci-github-actions-ray-cluster-key - InstanceType: i3.2xlarge - ImageId: ami-04dd23e62ed049936 + InstanceType: '{{CLUSTER_PROFILE/instance_type}}' + ImageId: '{{CLUSTER_PROFILE/image_id}}' IamInstanceProfile: Name: ray-autoscaler-v1 setup_commands: -# Mount drive -- | - findmnt /tmp 1> /dev/null - code=$? - if [ $code -ne 0 ]; then - sudo mkfs.ext4 /dev/nvme0n1 - sudo mount -t ext4 /dev/nvme0n1 /tmp - sudo chmod 777 /tmp - fi -# Install dependencies -# GitHub Actions workflow will replace all parameters between `{{...}}` with the -# actual values as determined dynamically during runtime of the actual workflow. +- '{{CLUSTER_PROFILE/volume_mount}}' - sudo snap install aws-cli --classic - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc From c98a284a1981d6a4cdffe37c426fe701168d2355 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 14:52:21 -0800 Subject: [PATCH 2/8] Add script to templatize ray config file during runtime --- .github/ci-scripts/templatize_ray_config.py | 80 +++++++++++++++++++++ .github/workflows/run-cluster.yaml | 33 +++++---- 2 files changed, 101 insertions(+), 12 deletions(-) create mode 100644 .github/ci-scripts/templatize_ray_config.py diff --git a/.github/ci-scripts/templatize_ray_config.py b/.github/ci-scripts/templatize_ray_config.py new file mode 100644 index 0000000000..ea597c1dd1 --- /dev/null +++ b/.github/ci-scripts/templatize_ray_config.py @@ -0,0 +1,80 @@ +import sys +from argparse import ArgumentParser +from dataclasses import dataclass +from typing import Optional + +CLUSTER_NAME_PLACEHOLDER = "{{CLUSTER_NAME}}" +DAFT_VERSION_PLACEHOLDER = "{{DAFT_VERSION}}" +PYTHON_VERSION_PLACEHOLDER = "{{PYTHON_VERSION}}" +CLUSTER_PROFILE__NODE_COUNT = "'{{CLUSTER_PROFILE/node_count}}'" +CLUSTER_PROFILE__INSTANCE_TYPE = "{{CLUSTER_PROFILE/instance_type}}" +CLUSTER_PROFILE__IMAGE_ID = "{{CLUSTER_PROFILE/image_id}}" +CLUSTER_PROFILE__VOLUME_MOUNT = "'{{CLUSTER_PROFILE/volume_mount}}'" + + +@dataclass +class Profile: + node_count: int + instance_type: int + image_id: int + volume_mount: Optional[int] + + +profiles: dict[str, Optional[Profile]] = { + "debug_xs-x86": None, + "medium-x86": Profile( + instance_type="i3.2xlarge", + image_id="ami-04dd23e62ed049936", + node_count=4, + volume_mount=""" | + findmnt /tmp 1> /dev/null + code=$? + if [ $code -ne 0 ]; then + sudo mkfs.ext4 /dev/nvme0n1 + sudo mount -t ext4 /dev/nvme0n1 /tmp + sudo chmod 777 /tmp + fi""", + ), +} + + +if __name__ == "__main__": + content = sys.stdin.read() + + parser = ArgumentParser() + parser.add_argument("--cluster-name") + parser.add_argument("--daft-version") + parser.add_argument("--python-version") + parser.add_argument("--cluster-profile") + args = parser.parse_args() + + if args.cluster_name: + content = content.replace(CLUSTER_NAME_PLACEHOLDER, args.cluster_name) + + if args.daft_version: + content = content.replace(DAFT_VERSION_PLACEHOLDER, f"=={args.daft_version}") + else: + content = content.replace(DAFT_VERSION_PLACEHOLDER, "") + + if args.python_version: + content = content.replace(PYTHON_VERSION_PLACEHOLDER, args.python_version) + + if cluster_profile := args.cluster_profile: + cluster_profile: str + if cluster_profile not in profiles: + raise Exception(f'Cluster profile "{cluster_profile}" not found') + + profile = profiles[cluster_profile] + if profile is None: + raise Exception(f'Cluster profile "{cluster_profile}" not yet implemented') + + assert profile is not None + content = content.replace(CLUSTER_PROFILE__NODE_COUNT, str(profile.node_count)) + content = content.replace(CLUSTER_PROFILE__INSTANCE_TYPE, profile.instance_type) + content = content.replace(CLUSTER_PROFILE__IMAGE_ID, profile.image_id) + if profile.volume_mount: + content = content.replace(CLUSTER_PROFILE__VOLUME_MOUNT, profile.volume_mount) + else: + content = content.replace(CLUSTER_PROFILE__VOLUME_MOUNT, "echo 'Nothing to mount; skipping'") + + print(content) diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml index 1a296b36a0..3d3e5b1ae6 100644 --- a/.github/workflows/run-cluster.yaml +++ b/.github/workflows/run-cluster.yaml @@ -12,6 +12,13 @@ on: description: The version of python to use required: false default: "3.9" + cluster_profile: + type: choice + options: + - medium-x86 + description: The profile to use for the cluster + required: false + default: medium-x86 command: type: string description: The command to run on the cluster @@ -50,14 +57,16 @@ jobs: uv pip install ray[default] boto3 - name: Dynamically update ray config file run: | - id="ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" - sed -i "s|{{RAY_CLUSTER_NAME}}|$id|g" .github/assets/benchmarking_ray_config.yaml - sed -i 's|{{PYTHON_VERSION}}|${{ inputs.python_version }}|g' .github/assets/benchmarking_ray_config.yaml - if [[ '${{ inputs.daft_version }}' ]]; then - sed -i 's|{{DAFT_VERSION}}|==${{ inputs.daft_version }}|g' .github/assets/benchmarking_ray_config.yaml - else - sed -i 's|{{DAFT_VERSION}}||g' .github/assets/benchmarking_ray_config.yaml - fi + source .venv/bin/activate + (cat .github/assets/template.yaml \ + | python .github/ci-scripts/templatize_ray_config.py \ + --cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \ + --daft-version ${{ inputs.daft_version }} \ + --python-version ${{ inputs.python_version }} \ + --cluster-profile 'medium-x86' + ) >> .github/assets/ray.yaml + echo "Ray configuration file:" >> $GITHUB_STEP_SUMMARY + cat .github/assets/ray.yaml >> $GITHUB_STEP_SUMMARY - name: Download private ssh key run: | KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text) @@ -66,11 +75,11 @@ jobs: - name: Spin up ray cluster run: | source .venv/bin/activate - ray up .github/assets/benchmarking_ray_config.yaml -y + ray up .github/assets/ray.yaml -y - name: Setup connection to ray cluster run: | source .venv/bin/activate - ray dashboard .github/assets/benchmarking_ray_config.yaml & + ray dashboard .github/assets/ray.yaml & - name: Submit job to ray cluster run: | source .venv/bin/activate @@ -86,7 +95,7 @@ jobs: - name: Download log files from ray cluster run: | source .venv/bin/activate - ray rsync-down .github/assets/benchmarking_ray_config.yaml /tmp/ray/session_*/logs ray-daft-logs + ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs ray-daft-logs - name: Kill connection to ray cluster run: | PID=$(lsof -t -i:8265) @@ -103,7 +112,7 @@ jobs: if: always() run: | source .venv/bin/activate - ray down .github/assets/benchmarking_ray_config.yaml -y + ray down .github/assets/ray.yaml -y - name: Upload log files uses: actions/upload-artifact@v4 with: From 5eb618f960ea5a0f4adc019929a89046b31343d6 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 14:56:20 -0800 Subject: [PATCH 3/8] Add quotes around arguments; replace hard-coded value with input --- .github/workflows/run-cluster.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml index 3d3e5b1ae6..ff27f4b747 100644 --- a/.github/workflows/run-cluster.yaml +++ b/.github/workflows/run-cluster.yaml @@ -61,9 +61,9 @@ jobs: (cat .github/assets/template.yaml \ | python .github/ci-scripts/templatize_ray_config.py \ --cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \ - --daft-version ${{ inputs.daft_version }} \ - --python-version ${{ inputs.python_version }} \ - --cluster-profile 'medium-x86' + --daft-version '${{ inputs.daft_version }}' \ + --python-version '${{ inputs.python_version }}' \ + --cluster-profile '${{ inputs.cluster_profile }}' ) >> .github/assets/ray.yaml echo "Ray configuration file:" >> $GITHUB_STEP_SUMMARY cat .github/assets/ray.yaml >> $GITHUB_STEP_SUMMARY From e4547baf4ed190449c3b166fd1b19ad47156f6da Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 15:03:17 -0800 Subject: [PATCH 4/8] Change the way the template file is printed out --- .github/workflows/run-cluster.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml index ff27f4b747..3d64325cf0 100644 --- a/.github/workflows/run-cluster.yaml +++ b/.github/workflows/run-cluster.yaml @@ -65,8 +65,7 @@ jobs: --python-version '${{ inputs.python_version }}' \ --cluster-profile '${{ inputs.cluster_profile }}' ) >> .github/assets/ray.yaml - echo "Ray configuration file:" >> $GITHUB_STEP_SUMMARY - cat .github/assets/ray.yaml >> $GITHUB_STEP_SUMMARY + cat .github/assets/ray.yaml - name: Download private ssh key run: | KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text) From d8c8dade6f58d9c7453708c252836cc77c62d19d Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 15:17:52 -0800 Subject: [PATCH 5/8] Add ability to configure `ssh_user` based off of cluster-profile --- .../assets/{template.yaml => .template.yaml} | 26 +++++++++---------- .github/ci-scripts/templatize_ray_config.py | 24 ++++++++++------- .github/workflows/run-cluster.yaml | 2 +- 3 files changed, 28 insertions(+), 24 deletions(-) rename .github/assets/{template.yaml => .template.yaml} (62%) diff --git a/.github/assets/template.yaml b/.github/assets/.template.yaml similarity index 62% rename from .github/assets/template.yaml rename to .github/assets/.template.yaml index f1406befaf..0c7e63a6d5 100644 --- a/.github/assets/template.yaml +++ b/.github/assets/.template.yaml @@ -2,7 +2,7 @@ # GitHub Actions workflow will replace all parameters between `{{...}}` with the # actual values as determined dynamically during runtime of the actual workflow. -cluster_name: '{{CLUSTER_NAME}}' +cluster_name: \{{CLUSTER_NAME}} provider: type: aws @@ -12,40 +12,40 @@ provider: GroupName: ray-autoscaler-c1 auth: - ssh_user: ubuntu + ssh_user: \{{CLUSTER_PROFILE/ssh_user}} ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem -max_workers: '{{CLUSTER_PROFILE/node_count}}' +max_workers: \{{CLUSTER_PROFILE/node_count}} available_node_types: ray.head.default: resources: {"CPU": 0} node_config: KeyName: ci-github-actions-ray-cluster-key - InstanceType: '{{CLUSTER_PROFILE/instance_type}}' - ImageId: '{{CLUSTER_PROFILE/image_id}}' + InstanceType: \{{CLUSTER_PROFILE/instance_type}} + ImageId: \{{CLUSTER_PROFILE/image_id}} IamInstanceProfile: Name: ray-autoscaler-v1 ray.worker.default: - min_workers: '{{CLUSTER_PROFILE/node_count}}' - max_workers: '{{CLUSTER_PROFILE/node_count}}' + min_workers: \{{CLUSTER_PROFILE/node_count}} + max_workers: \{{CLUSTER_PROFILE/node_count}} resources: {} node_config: KeyName: ci-github-actions-ray-cluster-key - InstanceType: '{{CLUSTER_PROFILE/instance_type}}' - ImageId: '{{CLUSTER_PROFILE/image_id}}' + InstanceType: \{{CLUSTER_PROFILE/instance_type}} + ImageId: \{{CLUSTER_PROFILE/image_id}} IamInstanceProfile: Name: ray-autoscaler-v1 setup_commands: -- '{{CLUSTER_PROFILE/volume_mount}}' +- \{{CLUSTER_PROFILE/volume_mount}} - sudo snap install aws-cli --classic - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc - source ~/.bashrc -- uv python install {{PYTHON_VERSION}} -- uv python pin {{PYTHON_VERSION}} +- uv python install \{{PYTHON_VERSION}} +- uv python pin \{{PYTHON_VERSION}} - uv v - echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc - source .venv/bin/activate -- uv pip install pip ray[default] py-spy getdaft{{DAFT_VERSION}} +- uv pip install pip ray[default] py-spy getdaft\{{DAFT_VERSION}} diff --git a/.github/ci-scripts/templatize_ray_config.py b/.github/ci-scripts/templatize_ray_config.py index ea597c1dd1..68c9bdc2f4 100644 --- a/.github/ci-scripts/templatize_ray_config.py +++ b/.github/ci-scripts/templatize_ray_config.py @@ -3,21 +3,23 @@ from dataclasses import dataclass from typing import Optional -CLUSTER_NAME_PLACEHOLDER = "{{CLUSTER_NAME}}" -DAFT_VERSION_PLACEHOLDER = "{{DAFT_VERSION}}" -PYTHON_VERSION_PLACEHOLDER = "{{PYTHON_VERSION}}" -CLUSTER_PROFILE__NODE_COUNT = "'{{CLUSTER_PROFILE/node_count}}'" -CLUSTER_PROFILE__INSTANCE_TYPE = "{{CLUSTER_PROFILE/instance_type}}" -CLUSTER_PROFILE__IMAGE_ID = "{{CLUSTER_PROFILE/image_id}}" -CLUSTER_PROFILE__VOLUME_MOUNT = "'{{CLUSTER_PROFILE/volume_mount}}'" +CLUSTER_NAME_PLACEHOLDER = "\\{{CLUSTER_NAME}}" +DAFT_VERSION_PLACEHOLDER = "\\{{DAFT_VERSION}}" +PYTHON_VERSION_PLACEHOLDER = "\\{{PYTHON_VERSION}}" +CLUSTER_PROFILE__NODE_COUNT = "\\{{CLUSTER_PROFILE/node_count}}" +CLUSTER_PROFILE__INSTANCE_TYPE = "\\{{CLUSTER_PROFILE/instance_type}}" +CLUSTER_PROFILE__IMAGE_ID = "\\{{CLUSTER_PROFILE/image_id}}" +CLUSTER_PROFILE__SSH_USER = "\\{{CLUSTER_PROFILE/ssh_user}}" +CLUSTER_PROFILE__VOLUME_MOUNT = "\\{{CLUSTER_PROFILE/volume_mount}}" @dataclass class Profile: node_count: int - instance_type: int - image_id: int - volume_mount: Optional[int] + instance_type: str + image_id: str + ssh_user: str + volume_mount: Optional[str] profiles: dict[str, Optional[Profile]] = { @@ -26,6 +28,7 @@ class Profile: instance_type="i3.2xlarge", image_id="ami-04dd23e62ed049936", node_count=4, + ssh_user="ubuntu", volume_mount=""" | findmnt /tmp 1> /dev/null code=$? @@ -72,6 +75,7 @@ class Profile: content = content.replace(CLUSTER_PROFILE__NODE_COUNT, str(profile.node_count)) content = content.replace(CLUSTER_PROFILE__INSTANCE_TYPE, profile.instance_type) content = content.replace(CLUSTER_PROFILE__IMAGE_ID, profile.image_id) + content = content.replace(CLUSTER_PROFILE__SSH_USER, profile.ssh_user) if profile.volume_mount: content = content.replace(CLUSTER_PROFILE__VOLUME_MOUNT, profile.volume_mount) else: diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml index 3d64325cf0..3eacb8b1c9 100644 --- a/.github/workflows/run-cluster.yaml +++ b/.github/workflows/run-cluster.yaml @@ -58,7 +58,7 @@ jobs: - name: Dynamically update ray config file run: | source .venv/bin/activate - (cat .github/assets/template.yaml \ + (cat .github/assets/.template.yaml \ | python .github/ci-scripts/templatize_ray_config.py \ --cluster-name "ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" \ --daft-version '${{ inputs.daft_version }}' \ From aa76f92f0bfaaa34e80714673df20ae95259d96f Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 18:24:32 -0800 Subject: [PATCH 6/8] Add debug_xs-x86 cluster-profile --- .github/ci-scripts/templatize_ray_config.py | 7 ++++++- .github/workflows/run-cluster.yaml | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/ci-scripts/templatize_ray_config.py b/.github/ci-scripts/templatize_ray_config.py index 68c9bdc2f4..9841e5b4b0 100644 --- a/.github/ci-scripts/templatize_ray_config.py +++ b/.github/ci-scripts/templatize_ray_config.py @@ -23,7 +23,12 @@ class Profile: profiles: dict[str, Optional[Profile]] = { - "debug_xs-x86": None, + "debug_xs-x86": Profile( + instance_type="t3.large", + image_id="ami-04dd23e62ed049936", + node_count=1, + ssh_user="ubuntu", + ), "medium-x86": Profile( instance_type="i3.2xlarge", image_id="ami-04dd23e62ed049936", diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml index bcc31d3f6c..07cd974df2 100644 --- a/.github/workflows/run-cluster.yaml +++ b/.github/workflows/run-cluster.yaml @@ -16,6 +16,7 @@ on: type: choice options: - medium-x86 + - debug_xs-x86 description: The profile to use for the cluster required: false default: medium-x86 From 9eec4c4897127664ca154b14a12afd05bcf16b9e Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 18:30:23 -0800 Subject: [PATCH 7/8] Add default value to field in Profile class --- .github/ci-scripts/templatize_ray_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci-scripts/templatize_ray_config.py b/.github/ci-scripts/templatize_ray_config.py index 9841e5b4b0..c871a91419 100644 --- a/.github/ci-scripts/templatize_ray_config.py +++ b/.github/ci-scripts/templatize_ray_config.py @@ -19,7 +19,7 @@ class Profile: instance_type: str image_id: str ssh_user: str - volume_mount: Optional[str] + volume_mount: Optional[str] = None profiles: dict[str, Optional[Profile]] = { From cc5c5f5b74dd848f562131d60d09c0e4d5d5db73 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Mon, 25 Nov 2024 18:34:43 -0800 Subject: [PATCH 8/8] Update reference to old ray config file --- .github/workflows/run-cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml index 07cd974df2..8c403f41f9 100644 --- a/.github/workflows/run-cluster.yaml +++ b/.github/workflows/run-cluster.yaml @@ -95,7 +95,7 @@ jobs: - name: Download log files from ray cluster run: | source .venv/bin/activate - ray rsync-down .github/assets/benchmarking_ray_config.yaml /tmp/ray/session_*/logs ray-daft-logs + ray rsync-down .github/assets/ray.yaml /tmp/ray/session_*/logs ray-daft-logs find ray-daft-logs -depth -name '*:*' -exec bash -c ' for filepath; do dir=$(dirname "$filepath")