Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't install DCGM if the driver has been blacklisted #363

Merged
merged 7 commits into from
Dec 14, 2024
29 changes: 27 additions & 2 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@

import logging
import os
import re
import shutil
import stat
import subprocess
from abc import ABCMeta, abstractmethod
from glob import iglob
from itertools import chain
from pathlib import Path
from typing import Dict, List, Set, Tuple

Expand Down Expand Up @@ -669,9 +672,31 @@ def disk_hw_verifier() -> Set[HWTool]:


def nvidia_gpu_verifier() -> Set[HWTool]:
    """Verify if the hardware has an NVIDIA GPU and the driver is not blacklisted.

    If the sysadmin has blacklisted the nvidia driver (e.g. to configure PCI
    passthrough), DCGM won't be able to manage the GPU, so DCGM is not reported.

    Returns a set containing ``HWTool.DCGM`` when at least one display device
    reported by ``lshw`` has an NVIDIA vendor string and the nvidia kernel
    module is not blacklisted; otherwise an empty set.
    """
    gpus = lshw(class_filter="display")
    has_nvidia_gpu = any("nvidia" in gpu.get("vendor", "").lower() for gpu in gpus)

    # The blacklist check reads modprobe configuration from disk, so run it at
    # most once, and only when an NVIDIA GPU is actually present.
    if has_nvidia_gpu and not _is_nvidia_module_blacklisted():
        return {HWTool.DCGM}
    return set()


def _is_nvidia_module_blacklisted() -> bool:
module_re = re.compile(r"blacklist\s+nvidia")
for conffile in chain(iglob("/etc/modprobe.d/*.conf"), "/etc/modprobe.conf"):
aieri marked this conversation as resolved.
Show resolved Hide resolved
try:
with open(conffile, "r", encoding="utf-8") as fd:
for line in fd.readline():
Deezzir marked this conversation as resolved.
Show resolved Hide resolved
if module_re.match(line):
return True
except (IsADirectoryError, FileNotFoundError):
# glob may match directories, and modprobe.conf may or may not exist
continue
return False


def detect_available_tools() -> Set[HWTool]:
Expand Down
Loading