From 4b67b852b33057f0f2e6db44e585d5b0b2ce3273 Mon Sep 17 00:00:00 2001
From: Jay Chia <17691182+jaychia@users.noreply.github.com>
Date: Wed, 19 Jun 2024 11:26:11 -0700
Subject: [PATCH] [FEAT] Automatically use Ray Runner if Ray is initialized
 (#2282)

1. Automatically switches Daft to use the Ray Runner if a user calls
`ray.init(...)` before running any Daft queries
2. Also switches behavior to try and deprecate the `DAFT_RAY_ADDRESS`
environment variable so that we can centralize on the normal
`RAY_ADDRESS` behavior

This PR ensures the following behavior:

* If a user explicitly calls `daft.context.set_runner_ray/py`, this
overrides all behavior
* If a user calls daft.context.set_runner_ray with a specified address,
but Ray is already initialized, we warn them that their address is being
ignored
* Otherwise, on first execution Daft will attempt to retrieve the runner
config from the current environment:
    * Check for the `DAFT_RUNNER` environment variable for `RAY`/`PY`
    * Check to see if Ray is initialized, and if we aren't running in a Ray
      worker: `RAY`
    * Fall back onto: `PY`

Ray connection detection, on driver vs on worker:

<img width="1470" alt="image"
src="https://github.com/Eventual-Inc/Daft/assets/17691182/885a9dbc-0687-42ca-b0fa-6e99675bb00b">

Warning if set_runner_ray is called with an address after Ray is already
initialized:
<img width="1470" alt="image"
src="https://github.com/Eventual-Inc/Daft/assets/17691182/8468220d-ed5a-49e6-b6e4-61c4c24f7e4e">

---------

Co-authored-by: Jay Chia <jaychia94@gmail.com@users.noreply.github.com>
---
 daft/context.py                               | 83 +++++++++++++------
 daft/runners/ray_runner.py                    | 23 +++--
 .../text_to_image/using_cloud_with_ray.ipynb  |  2 +-
 3 files changed, 73 insertions(+), 35 deletions(-)
diff --git a/daft/context.py b/daft/context.py
index 38dcd17501..aa98ff23fc 100644
--- a/daft/context.py
+++ b/daft/context.py
@@ -39,20 +39,54 @@ def _get_runner_config_from_env() -> _RunnerConfig:
     To use:
 
     1. PyRunner: set DAFT_RUNNER=py
-    2. RayRunner: set DAFT_RUNNER=ray and optionally DAFT_RAY_ADDRESS=ray://...
+    2. RayRunner: set DAFT_RUNNER=ray and optionally RAY_ADDRESS=ray://...
     """
-    runner = os.getenv("DAFT_RUNNER") or "PY"
-    if runner.upper() == "RAY":
-        task_backlog_env = os.getenv("DAFT_DEVELOPER_RAY_MAX_TASK_BACKLOG")
+    runner_from_envvar = os.getenv("DAFT_RUNNER")
+    task_backlog_env = os.getenv("DAFT_DEVELOPER_RAY_MAX_TASK_BACKLOG")
+    use_thread_pool_env = os.getenv("DAFT_DEVELOPER_USE_THREAD_POOL")
+    use_thread_pool = bool(int(use_thread_pool_env)) if use_thread_pool_env is not None else None
+
+    ray_is_initialized = False
+    in_ray_worker = False
+    try:
+        import ray
+
+        if ray.is_initialized():
+            ray_is_initialized = True
+            # Check if running inside a Ray worker
+            if ray._private.worker.global_worker.mode == ray.WORKER_MODE:
+                in_ray_worker = True
+    except ImportError:
+        pass
+
+    # Retrieve the runner from environment variables
+    if runner_from_envvar and runner_from_envvar.upper() == "RAY":
+        ray_address = os.getenv("DAFT_RAY_ADDRESS")
+        if ray_address is not None:
+            warnings.warn(
+                "Detected usage of the $DAFT_RAY_ADDRESS environment variable. This will be deprecated, please use $RAY_ADDRESS instead."
+            )
+        else:
+            ray_address = os.getenv("RAY_ADDRESS")
+        return _RayRunnerConfig(
+            address=ray_address,
+            max_task_backlog=int(task_backlog_env) if task_backlog_env else None,
+        )
+    elif runner_from_envvar and runner_from_envvar.upper() == "PY":
+        return _PyRunnerConfig(use_thread_pool=use_thread_pool)
+    elif runner_from_envvar is not None:
+        raise ValueError(f"Unsupported DAFT_RUNNER variable: {runner_from_envvar}")
+
+    # Retrieve the runner from current initialized Ray environment, only if not running in a Ray worker
+    elif ray_is_initialized and not in_ray_worker:
         return _RayRunnerConfig(
-            address=os.getenv("DAFT_RAY_ADDRESS"),
+            address=None,  # No address supplied, use the existing connection
             max_task_backlog=int(task_backlog_env) if task_backlog_env else None,
         )
-    elif runner.upper() == "PY":
-        use_thread_pool_env = os.getenv("DAFT_DEVELOPER_USE_THREAD_POOL")
-        use_thread_pool = bool(int(use_thread_pool_env)) if use_thread_pool_env is not None else None
+
+    # Fall back on PyRunner
+    else:
         return _PyRunnerConfig(use_thread_pool=use_thread_pool)
-    raise ValueError(f"Unsupported DAFT_RUNNER variable: {runner}")
 
 
 @dataclasses.dataclass
@@ -66,7 +100,7 @@ class DaftContext:
     # Non-execution calls (e.g. creation of a dataframe, logical plan building etc) directly reference values in this config
     _daft_planning_config: PyDaftPlanningConfig = PyDaftPlanningConfig()
 
-    _runner_config: _RunnerConfig = dataclasses.field(default_factory=_get_runner_config_from_env)
+    _runner_config: _RunnerConfig | None = None
     _disallow_set_runner: bool = False
     _runner: Runner | None = None
 
@@ -100,13 +134,20 @@ def daft_planning_config(self) -> PyDaftPlanningConfig:
     @property
     def runner_config(self) -> _RunnerConfig:
         with self._lock:
+            return self._get_runner_config()
+
+    def _get_runner_config(self) -> _RunnerConfig:
+        if self._runner_config is not None:
             return self._runner_config
+        self._runner_config = _get_runner_config_from_env()
+        return self._runner_config
 
     def _get_runner(self) -> Runner:
         if self._runner is not None:
             return self._runner
 
-        if self._runner_config.name == "ray":
+        runner_config = self._get_runner_config()
+        if runner_config.name == "ray":
             from daft.runners.ray_runner import RayRunner
 
             assert isinstance(self._runner_config, _RayRunnerConfig)
@@ -114,27 +155,14 @@ def _get_runner(self) -> Runner:
                 address=self._runner_config.address,
                 max_task_backlog=self._runner_config.max_task_backlog,
             )
-        elif self._runner_config.name == "py":
+        elif runner_config.name == "py":
             from daft.runners.pyrunner import PyRunner
 
-            try:
-                import ray
-
-                if ray.is_initialized():
-                    logger.warning(
-                        "WARNING: Daft is NOT using Ray for execution!\n"
-                        "Daft is using the PyRunner but we detected an active Ray connection. "
-                        "If you intended to use the Daft RayRunner, please first run `daft.context.set_runner_ray()` "
-                        "before executing Daft queries."
-                    )
-            except ImportError:
-                pass
-
             assert isinstance(self._runner_config, _PyRunnerConfig)
             self._runner = PyRunner(use_thread_pool=self._runner_config.use_thread_pool)
 
         else:
-            raise NotImplementedError(f"Runner config implemented: {self._runner_config.name}")
+            raise NotImplementedError(f"Runner config not implemented: {runner_config.name}")
 
         # Mark DaftContext as having the runner set, which prevents any subsequent setting of the config
         # after the runner has been initialized once
@@ -165,7 +193,7 @@ def set_runner_ray(
     Alternatively, users can set this behavior via environment variables:
 
     1. DAFT_RUNNER=ray
-    2. Optionally, DAFT_RAY_ADDRESS=ray://...
+    2. Optionally, RAY_ADDRESS=ray://...
 
     **This function will throw an error if called multiple times in the same process.**
 
@@ -178,6 +206,7 @@ def set_runner_ray(
     Returns:
         DaftContext: Daft context after setting the Ray runner
     """
+
     ctx = get_context()
     with ctx._lock:
         if ctx._disallow_set_runner:
diff --git a/daft/runners/ray_runner.py b/daft/runners/ray_runner.py
index d8208ce752..0c48f9da74 100644
--- a/daft/runners/ray_runner.py
+++ b/daft/runners/ray_runner.py
@@ -739,10 +739,19 @@ def __init__(
     ) -> None:
         super().__init__()
         if ray.is_initialized():
-            logger.warning("Ray has already been initialized, Daft will reuse the existing Ray context.")
-        self.ray_context = ray.init(address=address, ignore_reinit_error=True)
+            if address is not None:
+                logger.warning(
+                    "Ray has already been initialized, Daft will reuse the existing Ray context and ignore the "
+                    "supplied address: %s",
+                    address,
+                )
+        else:
+            ray.init(address=address)
+
+        # Check if Ray is running in "client mode" (connected to a Ray cluster via a Ray client)
+        self.ray_client_mode = ray.util.client.ray.get_context().is_connected()
 
-        if isinstance(self.ray_context, ray.client_builder.ClientContext):
+        if self.ray_client_mode:
             # Run scheduler remotely if the cluster is connected remotely.
             self.scheduler_actor = SchedulerActor.options(  # type: ignore
                 name=SCHEDULER_ACTOR_NAME,
@@ -759,7 +768,7 @@ def __init__(
             )
 
     def active_plans(self) -> list[str]:
-        if isinstance(self.ray_context, ray.client_builder.ClientContext):
+        if self.ray_client_mode:
             return ray.get(self.scheduler_actor.active_plans.remote())
         else:
             return self.scheduler.active_plans()
@@ -772,7 +781,7 @@ def _start_plan(
     ) -> str:
         psets = {k: v.values() for k, v in self._part_set_cache.get_all_partition_sets().items()}
         result_uuid = str(uuid.uuid4())
-        if isinstance(self.ray_context, ray.client_builder.ClientContext):
+        if self.ray_client_mode:
             ray.get(
                 self.scheduler_actor.start_plan.remote(
                     daft_execution_config=daft_execution_config,
@@ -795,7 +804,7 @@ def _start_plan(
     def _stream_plan(self, result_uuid: str) -> Iterator[RayMaterializedResult]:
         try:
             while True:
-                if isinstance(self.ray_context, ray.client_builder.ClientContext):
+                if self.ray_client_mode:
                     result = ray.get(self.scheduler_actor.next.remote(result_uuid))
                 else:
                     result = self.scheduler.next(result_uuid)
@@ -808,7 +817,7 @@ def _stream_plan(self, result_uuid: str) -> Iterator[RayMaterializedResult]:
                 yield result
         finally:
             # Generator is out of scope, ensure that state has been cleaned up
-            if isinstance(self.ray_context, ray.client_builder.ClientContext):
+            if self.ray_client_mode:
                 ray.get(self.scheduler_actor.stop_plan.remote(result_uuid))
             else:
                 self.scheduler.stop_plan(result_uuid)
diff --git a/tutorials/text_to_image/using_cloud_with_ray.ipynb b/tutorials/text_to_image/using_cloud_with_ray.ipynb
index d0e44d039c..c14aa26be9 100644
--- a/tutorials/text_to_image/using_cloud_with_ray.ipynb
+++ b/tutorials/text_to_image/using_cloud_with_ray.ipynb
@@ -71,7 +71,7 @@
                 "\n",
                 "To activate the RayRunner, you can either:\n",
                 "\n",
-                "1. Use the `DAFT_RUNNER=ray` and optionally the `DAFT_RAY_ADDRESS` environment variables\n",
+                "1. Use the `DAFT_RUNNER=ray` and optionally the `RAY_ADDRESS` environment variables\n",
                 "2. Call `daft.context.set_runner_ray(...)` at the start of your program.\n",
                 "\n",
                 "We'll demonstrate option 2 here!"