From f6002f9f0b190903518252807985f3604cd59dac Mon Sep 17 00:00:00 2001
From: Raunak Bhagat
Date: Thu, 19 Dec 2024 14:22:07 -0800
Subject: [PATCH] ci: Output results in a CSV format (#3625)

# Overview

This PR makes the visualization of `run-cluster` results more user-friendly.

## Usage

If you're ever running a script (on the GHA runner node, not on Ray), you can write whatever files you want into the directory pointed to by the `$GHA_OUTPUT_DIR` env variable, and those files (or directories) will be uploaded to the GitHub Actions Summary Page once the job fully completes.

This is what the `.github/ci-scripts/job_runner.py` file does. If you ever create a new GHA runner script, writing any files to `$GHA_OUTPUT_DIR` will work the same way (see the example sketch after the patch below).
---
 .github/ci-scripts/job_runner.py   | 18 +++++++++++++++---
 .github/workflows/run-cluster.yaml | 11 +++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/.github/ci-scripts/job_runner.py b/.github/ci-scripts/job_runner.py
index 12c949136f..c36226c1ab 100644
--- a/.github/ci-scripts/job_runner.py
+++ b/.github/ci-scripts/job_runner.py
@@ -1,12 +1,14 @@
 # /// script
 # requires-python = ">=3.12"
-# dependencies = []
+# dependencies = ["ray[default]"]
 # ///
 
 import argparse
 import asyncio
+import csv
 import json
-from dataclasses import dataclass
+import os
+from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Optional
@@ -45,6 +47,11 @@ def submit_job(
     env_vars: str,
     enable_ray_tracing: bool,
 ):
+    if "GHA_OUTPUT_DIR" not in os.environ:
+        raise RuntimeError("Output directory environment variable not found; don't know where to store outputs")
+    output_dir = Path(os.environ["GHA_OUTPUT_DIR"])
+    output_dir.mkdir(exist_ok=True, parents=True)
+
     env_vars_dict = parse_env_var_str(env_vars)
     if enable_ray_tracing:
         env_vars_dict["DAFT_ENABLE_RAY_TRACING"] = "1"
@@ -85,7 +92,12 @@ def submit_job(
             result = Result(query=index, duration=duration, error_msg=error_msg)
             results.append(result)
 
-    print(f"{results=}")
+    output_file = output_dir / "out.csv"
+    with open(output_file, mode="w", newline="") as csv_file:
+        writer = csv.DictWriter(csv_file, fieldnames=results[0].__dataclass_fields__.keys())
+        writer.writeheader()
+        for result in results:
+            writer.writerow(asdict(result))
 
 
 if __name__ == "__main__":
diff --git a/.github/workflows/run-cluster.yaml b/.github/workflows/run-cluster.yaml
index f5c41aaf75..7bb35ac765 100644
--- a/.github/workflows/run-cluster.yaml
+++ b/.github/workflows/run-cluster.yaml
@@ -85,6 +85,10 @@ jobs:
           uv v
           source .venv/bin/activate
           uv pip install ray[default] boto3
+          GHA_OUTPUT_DIR=/tmp/outputs
+          mkdir -p $GHA_OUTPUT_DIR
+          echo "Output dir is set to $GHA_OUTPUT_DIR"
+          echo "GHA_OUTPUT_DIR=$GHA_OUTPUT_DIR" >> $GITHUB_ENV
       - name: Dynamically update ray config file
         run: |
          source .venv/bin/activate
@@ -121,6 +125,7 @@ jobs:
             echo 'Invalid command submitted; command cannot be empty'
             exit 1
           fi
+          echo "Output dir: $GHA_OUTPUT_DIR"
           python .github/ci-scripts/job_runner.py \
             --working-dir='${{ inputs.working_dir }}' \
             --entrypoint-script='${{ inputs.entrypoint_script }}' \
@@ -157,6 +162,12 @@ jobs:
         run: |
           source .venv/bin/activate
           ray down .github/assets/ray.yaml -y
+      - name: Upload output dir
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: outputs
+          path: ${{ env.GHA_OUTPUT_DIR }}
       - name: Upload log files
         if: always()
        uses: actions/upload-artifact@v4
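
For reference, here is a minimal sketch of how another GHA runner script could plug into the mechanism described in the Usage section above. The script contents and the `results.csv` file name are purely illustrative and not part of this patch; the only assumption carried over from the patch is that `$GHA_OUTPUT_DIR` points at a directory that the "Upload output dir" step uploads as the `outputs` artifact.

```python
# Hypothetical runner script (not part of this patch): anything written into
# $GHA_OUTPUT_DIR is picked up by the "Upload output dir" step added above.
import os
from pathlib import Path

# Fail fast if the workflow did not export GHA_OUTPUT_DIR (mirrors job_runner.py).
output_dir = Path(os.environ["GHA_OUTPUT_DIR"])
output_dir.mkdir(exist_ok=True, parents=True)

# Write any file you want surfaced on the GitHub Actions Summary Page.
# The file name and contents here are made up for illustration only.
(output_dir / "results.csv").write_text("query,duration,error_msg\n0,1.23,\n")
```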