Skip to content

Commit

Permalink
[FEAT] Add stateful actor context and set CUDA_VISIBLE_DEVICES (#3002)
Browse files Browse the repository at this point in the history
Resolves #2896

Some details about this PR:
- I moved the actor-local singleton out of `PyActorPool` into a
specialized class `PyStatefulActorSingleton`
- I changed GPU resources to be accounted for on a per-device level. That
resulted in creating the data class `AcquiredResources` to store the
resources used by a task or actor. The runner resources include not
only the amount of CPU and memory resources, but also the exact GPUs that
each task/actor is using, which enables setting `CUDA_VISIBLE_DEVICES` in
actors.
- I also moved resource acquisition and releasing logic into a
`PyRunnerResources` class
- I added validation that GPU resource requests must be integers if greater
than 1, which means it is no longer accurate to request
`actor_resource_requests * num_workers`, so the actor pool
context now asks for them individually.
  • Loading branch information
kevinzwang authored Oct 18, 2024
1 parent e4c6f3f commit 5795adc
Show file tree
Hide file tree
Showing 11 changed files with 528 additions and 186 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions daft/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
from daft.lazy_import import LazyImport

if TYPE_CHECKING:
import xml.etree.ElementTree as ET

import fsspec
import numpy as np
import pandas as pd
Expand All @@ -16,8 +14,6 @@
import pyarrow.json as pajson
import pyarrow.parquet as pq
else:
ET = LazyImport("xml.etree.ElementTree")

fsspec = LazyImport("fsspec")
np = LazyImport("numpy")
pd = LazyImport("pandas")
Expand Down
42 changes: 32 additions & 10 deletions daft/internal/gpu.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,40 @@
from __future__ import annotations

import subprocess
import warnings

from daft.dependencies import ET

def _raw_device_count_nvml() -> int:
"""
Return number of devices as reported by NVML or zero if NVML discovery/initialization failed.
Inspired by PyTorch: https://github.com/pytorch/pytorch/blob/88e54de21976aa504e797e47f06b480b9108ef5c/torch/cuda/__init__.py#L711
"""
from ctypes import CDLL, byref, c_int

def cuda_device_count():
"""Returns the number of CUDA devices detected by nvidia-smi command"""
try:
nvidia_smi_output = subprocess.check_output(["nvidia-smi", "-x", "-q"])
except Exception:
nvml_h = CDLL("libnvidia-ml.so.1")
except OSError:
return 0
rc = nvml_h.nvmlInit()
if rc != 0:
warnings.warn("Can't initialize NVML, assuming no CUDA devices.")
return 0
root = ET.fromstring(nvidia_smi_output.decode("utf-8"))
attached_gpus = root.find("attached_gpus")
if attached_gpus is None:
dev_count = c_int(0)
rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
if rc != 0:
warnings.warn("Can't get nvml device count, assuming no CUDA devices.")
return 0
return int(attached_gpus.text)
del nvml_h
return dev_count.value


def cuda_visible_devices() -> list[str]:
    """Get the list of CUDA devices visible to the current process.

    Respects the ``CUDA_VISIBLE_DEVICES`` environment variable when it is set;
    otherwise falls back to enumerating all devices via NVML.
    """
    import os

    visible_devices_envvar = os.getenv("CUDA_VISIBLE_DEVICES")

    if visible_devices_envvar is None:
        # No restriction configured: every NVML-visible device is available.
        return [str(i) for i in range(_raw_device_count_nvml())]

    # Honor the user-specified restriction, trimming whitespace and
    # dropping empty entries (e.g. trailing commas).
    return [device.strip() for device in visible_devices_envvar.split(",") if device.strip()]
Loading

0 comments on commit 5795adc

Please sign in to comment.