From 5dce4fb549a2430580176a1d75df4059adb53c74 Mon Sep 17 00:00:00 2001 From: Raunak Bhagat Date: Fri, 22 Nov 2024 18:23:38 -0800 Subject: [PATCH] [FEAT] Add steps to spin up, submit job, and spin down ray clusters (#3403) # Overview - new steps that: - spin up - submit job - spin down ray clusters ## Note If any of the previous steps fails, the "tear-down" step (responsible for tearing down the ray cluster) will still run. (The only way this tear-down step would not run is if the workflow is *manually* cancelled.) --- .github/assets/benchmarking_ray_config.yaml | 58 ++++++++++++++++ .github/workflows/build-commit.yaml | 2 +- .github/workflows/run-cluster.yaml | 73 +++++++++++++++++++++ .github/workflows/run-command-on-ray.yaml | 33 ---------- 4 files changed, 132 insertions(+), 34 deletions(-) create mode 100644 .github/assets/benchmarking_ray_config.yaml create mode 100644 .github/workflows/run-cluster.yaml delete mode 100644 .github/workflows/run-command-on-ray.yaml diff --git a/.github/assets/benchmarking_ray_config.yaml b/.github/assets/benchmarking_ray_config.yaml new file mode 100644 index 0000000000..8e098c8b19 --- /dev/null +++ b/.github/assets/benchmarking_ray_config.yaml @@ -0,0 +1,58 @@ +cluster_name: '{{RAY_CLUSTER_NAME}}' + +provider: + type: aws + region: us-west-2 + cache_stopped_nodes: true + security_group: + GroupName: ray-autoscaler-c1 + +auth: + ssh_user: ubuntu + ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem + +max_workers: 2 +available_node_types: + ray.head.default: + resources: {"CPU": 0} + node_config: + KeyName: ci-github-actions-ray-cluster-key + InstanceType: i3.2xlarge + ImageId: ami-04dd23e62ed049936 + IamInstanceProfile: + Name: ray-autoscaler-v1 + + ray.worker.default: + min_workers: 2 + max_workers: 2 + resources: {} + node_config: + KeyName: ci-github-actions-ray-cluster-key + InstanceType: i3.2xlarge + ImageId: ami-04dd23e62ed049936 + IamInstanceProfile: + Name: ray-autoscaler-v1 + +setup_commands: +# 
Mount drive +- | + findmnt /tmp 1> /dev/null + code=$? + if [ $code -ne 0 ]; then + sudo mkfs.ext4 /dev/nvme0n1 + sudo mount -t ext4 /dev/nvme0n1 /tmp + sudo chmod 777 /tmp + fi +# Install dependencies +# GitHub Actions workflow will replace all parameters between `{{...}}` with the +# actual values as determined dynamically during runtime of the actual workflow. +- sudo snap install aws-cli --classic +- curl -LsSf https://astral.sh/uv/install.sh | sh +- echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc +- source ~/.bashrc +- uv python install {{PYTHON_VERSION}} +- uv python pin {{PYTHON_VERSION}} +- uv v +- echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc +- source .venv/bin/activate +- uv pip install pip ray[default] py-spy getdaft{{DAFT_VERSION}} diff --git a/.github/workflows/build-commit.yaml b/.github/workflows/build-commit.yaml index 210549d434..a6754da847 100644 --- a/.github/workflows/build-commit.yaml +++ b/.github/workflows/build-commit.yaml @@ -1,4 +1,4 @@ -name: Build a Daft commit and store the outputted wheel in AWS S3 +name: build-commit on: workflow_dispatch: diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml new file mode 100644 index 0000000000..783ec0137f --- /dev/null +++ b/.github/workflows/run-cluster.yaml @@ -0,0 +1,73 @@ +name: run-cluster + +on: + workflow_dispatch: + inputs: + daft_version: + type: string + description: The wheel artifact to use + required: false + python_version: + type: string + description: The version of python to use + required: false + default: "3.9" + +jobs: + run-command: + runs-on: [self-hosted, linux, x64, ci-dev] + timeout-minutes: 15 # Remove for ssh debugging + permissions: + id-token: write + contents: read + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 1 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: us-west-2 + role-session-name: run-command-workflow + - name: 
Install uv, rust, python + uses: ./.github/actions/install + with: + python_version: ${{ inputs.python_version }} + - name: Setup uv environment + run: | + uv v + source .venv/bin/activate + uv pip install ray[default] boto3 + - name: Dynamically update ray config file + run: | + id="ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}" + sed -i "s|{{RAY_CLUSTER_NAME}}|$id|g" .github/assets/benchmarking_ray_config.yaml + sed -i 's|{{PYTHON_VERSION}}|${{ inputs.python_version }}|g' .github/assets/benchmarking_ray_config.yaml + if [[ '${{ inputs.daft_version }}' ]]; then + sed -i 's|{{DAFT_VERSION}}|==${{ inputs.daft_version }}|g' .github/assets/benchmarking_ray_config.yaml + else + sed -i 's|{{DAFT_VERSION}}||g' .github/assets/benchmarking_ray_config.yaml + fi + - name: Download private ssh key + run: | + KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text) + echo "$KEY" >> ~/.ssh/ci-github-actions-ray-cluster-key.pem + chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem + - name: Spin up ray cluster + run: | + source .venv/bin/activate + ray up .github/assets/benchmarking_ray_config.yaml -y + - name: Setup connection to ray cluster + run: | + source .venv/bin/activate + ray dashboard .github/assets/benchmarking_ray_config.yaml & + - name: Submit job to ray cluster + run: | + source .venv/bin/activate + ray job submit --address http://localhost:8265 -- python -c "print('Hello, world!')" + - name: Spin down ray cluster + if: always() + run: | + source .venv/bin/activate + ray down .github/assets/benchmarking_ray_config.yaml -y diff --git a/.github/workflows/run-command-on-ray.yaml b/.github/workflows/run-command-on-ray.yaml deleted file mode 100644 index b85e34dd1f..0000000000 --- a/.github/workflows/run-command-on-ray.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: Run some given command on a Ray Cluster - -on: - workflow_dispatch: - inputs: - daft_version: - type: string - description: The 
wheel artifact to use - required: false - python_version: - type: string - description: The version of python to use - required: false - default: "3.9" - -jobs: - run-tpch: - runs-on: [self-hosted, linux, x64, ci-dev] - timeout-minutes: 15 # Remove for ssh debugging - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 1 - - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: us-west-2 - role-session-name: run-command-workflow - - uses: ./.github/actions/install - with: - python_version: ${{ inputs.python_version }}