[FEAT] Add stateful actor context and set CUDA_VISIBLE_DEVICES (#3002)
Resolves #2896

Some details about this PR:
- I moved the actor-local singleton out of `PyActorPool` into a specialized class, `PyStatefulActorSingleton`.
- I changed GPU resources to be accounted for on a per-device level. This resulted in a new data class, `AcquiredResources`, which stores the resources used by a task or actor. The runner resources now include not just the amount of CPU and memory but the exact GPUs each task/actor is using, which enables setting `CUDA_VISIBLE_DEVICES` in actors (see the sketch below).
- I also moved resource acquisition and release logic into a `PyRunnerResources` class.
- I added validation that GPU resource requests greater than 1 must be integers, so requesting `actor_resource_requests * num_workers` up front is no longer accurate; the actor pool context now requests resources for each worker individually.
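As a rough illustration of the per-device accounting described above, an actor can pin itself to its assigned GPUs by exporting `CUDA_VISIBLE_DEVICES` before doing any CUDA work. This is a minimal sketch: the field layout of `AcquiredResources` and the `initialize_actor_gpu_context` helper below are hypothetical simplifications, not the exact code in this PR.

```python
import os
from dataclasses import dataclass


@dataclass
class AcquiredResources:
    """Simplified sketch: resources reserved for one task or actor."""
    num_cpus: float
    memory_bytes: int
    gpu_ids: list[str]  # exact devices, e.g. ["0", "3"], rather than just a count


def initialize_actor_gpu_context(acquired: AcquiredResources) -> None:
    """Hypothetical actor-side setup: restrict the actor process to its assigned GPUs."""
    # Setting CUDA_VISIBLE_DEVICES before any CUDA library initializes makes
    # frameworks such as PyTorch see only the devices this actor was granted.
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(acquired.gpu_ids)


# Example: an actor granted 2 CPUs, 4 GiB of memory, and GPUs "1" and "2".
resources = AcquiredResources(num_cpus=2.0, memory_bytes=4 * 1024**3, gpu_ids=["1", "2"])
initialize_actor_gpu_context(resources)
```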
1 parent e4c6f3f · commit 5795adc · Showing 11 changed files with 528 additions and 186 deletions.
@@ -1,18 +1,40 @@
 from __future__ import annotations

-import subprocess
-
-from daft.dependencies import ET
-
-
-def cuda_device_count():
-    """Returns the number of CUDA devices detected by nvidia-smi command"""
-    try:
-        nvidia_smi_output = subprocess.check_output(["nvidia-smi", "-x", "-q"])
-    except Exception:
-        return 0
-    root = ET.fromstring(nvidia_smi_output.decode("utf-8"))
-    attached_gpus = root.find("attached_gpus")
-    if attached_gpus is None:
-        return 0
-    return int(attached_gpus.text)
+import warnings
+
+
+def _raw_device_count_nvml() -> int:
+    """
+    Return number of devices as reported by NVML or zero if NVML discovery/initialization failed.
+
+    Inspired by PyTorch: https://github.com/pytorch/pytorch/blob/88e54de21976aa504e797e47f06b480b9108ef5c/torch/cuda/__init__.py#L711
+    """
+    from ctypes import CDLL, byref, c_int
+
+    try:
+        nvml_h = CDLL("libnvidia-ml.so.1")
+    except OSError:
+        return 0
+    rc = nvml_h.nvmlInit()
+    if rc != 0:
+        warnings.warn("Can't initialize NVML, assuming no CUDA devices.")
+        return 0
+    dev_count = c_int(0)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
+    if rc != 0:
+        warnings.warn("Can't get nvml device count, assuming no CUDA devices.")
+        return 0
+    del nvml_h
+    return dev_count.value
+
+
+def cuda_visible_devices() -> list[str]:
+    """Get the list of CUDA devices visible to the current process."""
+    import os
+
+    visible_devices_envvar = os.getenv("CUDA_VISIBLE_DEVICES")
+
+    if visible_devices_envvar is None:
+        return [str(i) for i in range(_raw_device_count_nvml())]
+
+    return [device.strip() for device in visible_devices_envvar.split(",") if device.strip()]
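A quick illustration of how the new `cuda_visible_devices()` helper behaves, based on the hunk above. The import path shown is an assumption for illustration; the module's actual location in the Daft repo may differ.

```python
import os

# Assumed import path for the helpers in the diff above; the actual
# module location in the Daft codebase may differ.
from daft.internal.gpu import cuda_visible_devices

# With CUDA_VISIBLE_DEVICES set, the helper simply parses the variable,
# dropping empty entries and surrounding whitespace.
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 2,"
assert cuda_visible_devices() == ["0", "2"]

# With the variable unset, it falls back to NVML device discovery and
# returns ["0", "1", ...] for each attached GPU, or [] on a machine
# without NVML / without GPUs.
del os.environ["CUDA_VISIBLE_DEVICES"]
print(cuda_visible_devices())
```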