From 31a7abc9c829c90d4abc9a8c9015e7f6d82b7e94 Mon Sep 17 00:00:00 2001
From: Andrew Gazelka <andrew.gazelka@gmail.com>
Date: Fri, 22 Nov 2024 17:13:48 -0800
Subject: [PATCH 1/2] [CHORE] connect: add tests for `df.take()` method (#3385)

---
 tests/connect/test_take.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 tests/connect/test_take.py

diff --git a/tests/connect/test_take.py b/tests/connect/test_take.py
new file mode 100644
index 0000000000..2e7809f232
--- /dev/null
+++ b/tests/connect/test_take.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+
+def test_take(spark_session):
+    # Create DataFrame with 10 rows
+    df = spark_session.range(10)
+
+    # Take the first 5 rows
+    result = df.take(5)
+
+    # Expected: the same rows obtained via limit(5) followed by collect()
+    expected = df.limit(5).collect()
+
+    assert result == expected
+
+    # Test take with more rows than exist
+    result_large = df.take(20)
+    expected_large = df.collect()
+    assert result_large == expected_large  # Should return all existing rows

From 5dce4fb549a2430580176a1d75df4059adb53c74 Mon Sep 17 00:00:00 2001
From: Raunak Bhagat <rabhagat31@gmail.com>
Date: Fri, 22 Nov 2024 18:23:38 -0800
Subject: [PATCH 2/2] [FEAT] Add steps to spin up, submit job, and spin down
 ray clusters (#3403)

# Overview
- new workflow steps that:
  - spin up a ray cluster
  - submit a job to it
  - spin the ray cluster down

## Note
If any of the previous steps fail, the "tear-down" step (responsible for
tearing down the ray cluster) will still run. (The only way this
tear-down step would not be run is if the workflow is *manually*
cancelled.)
---
 .github/assets/benchmarking_ray_config.yaml | 58 ++++++++++++++++
 .github/workflows/build-commit.yaml         |  2 +-
 .github/workflows/run-cluster.yaml          | 73 +++++++++++++++++++++
 .github/workflows/run-command-on-ray.yaml   | 33 ----------
 4 files changed, 132 insertions(+), 34 deletions(-)
 create mode 100644 .github/assets/benchmarking_ray_config.yaml
 create mode 100644 .github/workflows/run-cluster.yaml
 delete mode 100644 .github/workflows/run-command-on-ray.yaml

diff --git a/.github/assets/benchmarking_ray_config.yaml b/.github/assets/benchmarking_ray_config.yaml
new file mode 100644
index 0000000000..8e098c8b19
--- /dev/null
+++ b/.github/assets/benchmarking_ray_config.yaml
@@ -0,0 +1,58 @@
+cluster_name: '{{RAY_CLUSTER_NAME}}'
+
+provider:
+  type: aws
+  region: us-west-2
+  cache_stopped_nodes: true
+  security_group:
+    GroupName: ray-autoscaler-c1
+
+auth:
+  ssh_user: ubuntu
+  ssh_private_key: ~/.ssh/ci-github-actions-ray-cluster-key.pem
+
+max_workers: 2
+available_node_types:
+  ray.head.default:
+    resources: {"CPU": 0}
+    node_config:
+      KeyName: ci-github-actions-ray-cluster-key
+      InstanceType: i3.2xlarge
+      ImageId: ami-04dd23e62ed049936
+      IamInstanceProfile:
+        Name: ray-autoscaler-v1
+
+  ray.worker.default:
+    min_workers: 2
+    max_workers: 2
+    resources: {}
+    node_config:
+      KeyName: ci-github-actions-ray-cluster-key
+      InstanceType: i3.2xlarge
+      ImageId: ami-04dd23e62ed049936
+      IamInstanceProfile:
+        Name: ray-autoscaler-v1
+
+setup_commands:
+# Format and mount /dev/nvme0n1 at /tmp unless /tmp is already a mount point
+- |
+  findmnt /tmp 1> /dev/null
+  code=$?
+  if [ $code -ne 0 ]; then
+    sudo mkfs.ext4 /dev/nvme0n1
+    sudo mount -t ext4 /dev/nvme0n1 /tmp
+    sudo chmod 777 /tmp
+  fi
+# Install dependencies
+# The GitHub Actions workflow replaces every `{{...}}` placeholder in this file
+# with its actual value, which is determined when the workflow runs.
+- sudo snap install aws-cli --classic
+- curl -LsSf https://astral.sh/uv/install.sh | sh
+- echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc
+- source ~/.bashrc
+- uv python install {{PYTHON_VERSION}}
+- uv python pin {{PYTHON_VERSION}}
+- uv v
+- echo "source $HOME/.venv/bin/activate" >> $HOME/.bashrc
+- source .venv/bin/activate
+- uv pip install pip ray[default] py-spy getdaft{{DAFT_VERSION}}
diff --git a/.github/workflows/build-commit.yaml b/.github/workflows/build-commit.yaml
index 210549d434..a6754da847 100644
--- a/.github/workflows/build-commit.yaml
+++ b/.github/workflows/build-commit.yaml
@@ -1,4 +1,4 @@
-name: Build a Daft commit and store the outputted wheel in AWS S3
+name: build-commit
 
 on:
   workflow_dispatch:
diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml
new file mode 100644
index 0000000000..783ec0137f
--- /dev/null
+++ b/.github/workflows/run-cluster.yaml
@@ -0,0 +1,73 @@
+name: run-cluster
+
+on:
+  workflow_dispatch:
+    inputs:
+      daft_version:
+        type: string
+        description: The wheel artifact to use
+        required: false
+      python_version:
+        type: string
+        description: The version of python to use
+        required: false
+        default: "3.9"
+
+jobs:
+  run-command:
+    runs-on: [self-hosted, linux, x64, ci-dev]
+    timeout-minutes: 15 # Remove for ssh debugging
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - name: Checkout repo
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 1
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: us-west-2
+        role-session-name: run-command-workflow
+    - name: Install uv, rust, python
+      uses: ./.github/actions/install
+      with:
+        python_version: ${{ inputs.python_version }}
+    - name: Setup uv environment
+      run: |
+        uv v
+        source .venv/bin/activate
+        uv pip install ray[default] boto3
+    - name: Dynamically update ray config file
+      run: |
+        id="ray-ci-run-${{ github.run_id }}_${{ github.run_attempt }}"
+        sed -i "s|{{RAY_CLUSTER_NAME}}|$id|g" .github/assets/benchmarking_ray_config.yaml
+        sed -i 's|{{PYTHON_VERSION}}|${{ inputs.python_version }}|g' .github/assets/benchmarking_ray_config.yaml
+        if [[ '${{ inputs.daft_version }}' ]]; then
+          sed -i 's|{{DAFT_VERSION}}|==${{ inputs.daft_version }}|g' .github/assets/benchmarking_ray_config.yaml
+        else
+          sed -i 's|{{DAFT_VERSION}}||g' .github/assets/benchmarking_ray_config.yaml
+        fi
+    - name: Download private ssh key
+      run: |
+        KEY=$(aws secretsmanager get-secret-value --secret-id ci-github-actions-ray-cluster-key-3 --query SecretString --output text)
+        echo "$KEY" > ~/.ssh/ci-github-actions-ray-cluster-key.pem  # overwrite (not append) so a key file left by an earlier run is replaced
+        chmod 600 ~/.ssh/ci-github-actions-ray-cluster-key.pem
+    - name: Spin up ray cluster
+      run: |
+        source .venv/bin/activate
+        ray up .github/assets/benchmarking_ray_config.yaml -y
+    - name: Setup connection to ray cluster
+      run: |
+        source .venv/bin/activate
+        ray dashboard .github/assets/benchmarking_ray_config.yaml &
+    - name: Submit job to ray cluster
+      run: |
+        source .venv/bin/activate
+        ray job submit --address http://localhost:8265 -- python -c "print('Hello, world!')"
+    - name: Spin down ray cluster
+      if: always()
+      run: |
+        source .venv/bin/activate
+        ray down .github/assets/benchmarking_ray_config.yaml -y
diff --git a/.github/workflows/run-command-on-ray.yaml b/.github/workflows/run-command-on-ray.yaml
deleted file mode 100644
index b85e34dd1f..0000000000
--- a/.github/workflows/run-command-on-ray.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Run some given command on a Ray Cluster
-
-on:
-  workflow_dispatch:
-    inputs:
-      daft_version:
-        type: string
-        description: The wheel artifact to use
-        required: false
-      python_version:
-        type: string
-        description: The version of python to use
-        required: false
-        default: "3.9"
-
-jobs:
-  run-tpch:
-    runs-on: [self-hosted, linux, x64, ci-dev]
-    timeout-minutes: 15 # Remove for ssh debugging
-    permissions:
-      id-token: write
-      contents: read
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        fetch-depth: 1
-    - uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: us-west-2
-        role-session-name: run-command-workflow
-    - uses: ./.github/actions/install
-      with:
-        python_version: ${{ inputs.python_version }}