Skip to content

Commit

Permalink
Stop managing the installation of the NVIDIA driver. (#380)
Browse files Browse the repository at this point in the history
While we initially provided automatic installation of the NVIDIA driver
as a convenience, we then ran into the complexity of dealing with users
wanting to configure pci-passthrough and/or vgpu, and to possibly move
across these configurations post-deployment (see #362, #379).

After some more discussions, we agreed that deploying a gpu driver is
not the responsibility of hardware-observer, but rather of the principal charm
that needs to use the gpu (e.g. nova or kubernetes-worker).

This commit therefore drops the functionality of automatically
installing the driver and determining if it has been blacklisted for a
simpler workflow of installing DCGM only if a driver is found to have
been installed and loaded.

Fixes: #379
  • Loading branch information
aieri authored Dec 19, 2024
1 parent 3caaea7 commit 2206174
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 288 deletions.
6 changes: 6 additions & 0 deletions src/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import subprocess
import typing as t
from pathlib import Path

from charms.operator_libs_linux.v0 import apt

Expand Down Expand Up @@ -102,3 +103,8 @@ def hwinfo(*args: str) -> t.Dict[str, str]:
key = item.splitlines()[0].strip()
hardware[key] = item
return hardware


def is_nvidia_driver_loaded() -> bool:
"""Determine if an NVIDIA driver has been loaded."""
return Path("/proc/driver/nvidia/version").exists()
110 changes: 9 additions & 101 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import logging
import os
import re
import shutil
import stat
import subprocess
Expand Down Expand Up @@ -43,6 +42,7 @@
LSHW_SUPPORTED_STORAGES,
get_bmc_address,
hwinfo,
is_nvidia_driver_loaded,
lshw,
)
from keys import HP_KEYS
Expand All @@ -62,14 +62,6 @@ def __init__(self, tool: HWTool, path: Path):
self.message = f"Tool: {tool} path: {path} size is zero"


class ResourceInstallationError(Exception):
"""Exception raised when a hardware tool installation fails."""

def __init__(self, tool: HWTool):
"""Init."""
super().__init__(f"Installation failed for tool: {tool}")


def copy_to_snap_common_bin(source: Path, filename: str) -> None:
"""Copy file to $SNAP_COMMON/bin folder."""
Path(f"{SNAP_COMMON}/bin").mkdir(parents=False, exist_ok=True)
Expand Down Expand Up @@ -266,54 +258,6 @@ def _create_custom_metrics(self) -> None:
raise err


class NVIDIADriverStrategy(APTStrategyABC):
"""NVIDIA driver strategy class."""

_name = HWTool.NVIDIA_DRIVER
pkg_pattern = r"nvidia(?:-[a-zA-Z-]*)?-(\d+)(?:-[a-zA-Z]*)?"

def install(self) -> None:
"""Install the NVIDIA driver if not present."""
if Path("/proc/driver/nvidia/version").exists():
logger.info("NVIDIA driver already installed in the machine")
return

with open("/proc/modules", encoding="utf-8") as modules:
if "nouveau" in modules.read():
logger.error("Nouveau driver is loaded. Unload it before installing NVIDIA driver")
raise ResourceInstallationError(self._name)

logger.info("Installing NVIDIA driver")
apt.add_package("ubuntu-drivers-common", update_cache=True)

try:
# This can be changed to check_call and not rely in the output if this is fixed
# https://github.com/canonical/ubuntu-drivers-common/issues/106
# https://bugs.launchpad.net/ubuntu/+source/ubuntu-drivers-common/+bug/2090502
result = subprocess.check_output("ubuntu-drivers --gpgpu install".split(), text=True)
subprocess.check_call("modprobe nvidia".split())

except subprocess.CalledProcessError as err:
logger.error("Failed to install the NVIDIA driver: %s", err)
raise err

if "No drivers found for installation" in result:
logger.warning(
"No drivers for the NVIDIA GPU were found. Manual installation is necessary"
)
raise ResourceInstallationError(self._name)

logger.info("NVIDIA driver installed")

def remove(self) -> None:
"""Drivers shouldn't be removed by the strategy."""
return None

def check(self) -> bool:
"""Check if driver was installed."""
return Path("/proc/driver/nvidia/version").exists()


class SmartCtlExporterStrategy(SnapStrategy):
"""SmartCtl strategy class."""

Expand Down Expand Up @@ -670,61 +614,25 @@ def disk_hw_verifier() -> Set[HWTool]:


def nvidia_gpu_verifier() -> Set[HWTool]:
"""Verify if the hardware has NVIDIA gpu and the driver is not blacklisted.
"""Verify if an NVIDIA gpu is present and the driver is loaded.
If the sysadmin has blacklisted the nvidia driver (e.g. to configure pci passthrough)
DCGM won't be able to manage the GPU
Depending on the usage of the node (local gpu usage, vgpu configuration,
pci passthrough), a driver must or must not be installed. Since hardware
observer has no way to know what is the intention of the operator, we don't
automate the graphics driver installation. This task should be left to the
principal charm that is going to use the gpu.
"""
gpus = lshw(class_filter="display")
if any("nvidia" in gpu.get("vendor", "").lower() for gpu in gpus):
logger.debug("NVIDIA GPU(s) detected")
if not _is_nvidia_module_blacklisted():
if is_nvidia_driver_loaded():
logger.debug("Enabling DCGM.")
return {HWTool.DCGM}

logger.debug("the NVIDIA driver has been blacklisted. Not enabling DCGM.")
logger.debug("no NVIDIA driver has been loaded. Not enabling DCGM.")
return set()


def _is_nvidia_module_blacklisted() -> bool:
"""Verify if the NVIDIA driver has been blacklisted.
This is currently done by looking into modprobe config and kernel parameters
NOTE: we can't simply try loading the module with `modprobe -n <module>` because:
* the driver may not be installed
* we don't know the full name of the module
"""
return (
_is_nvidia_module_blacklisted_via_modprobe() or _is_nvidia_module_blacklisted_via_cmdline()
)


def _is_nvidia_module_blacklisted_via_modprobe() -> bool:
"""Verify if the NVIDIA driver has been blacklisted via modprobe config.
see the manpages of modprobe and modprobe.d for more details
"""
modprobe_config = subprocess.check_output(["modprobe", "-c"], text=True).split("\n")

# modprobe normalizes config options to "blacklist MODULE" so no need to
# worry about extra whitespace
return any(opt.startswith("blacklist nvidia") for opt in modprobe_config)


def _is_nvidia_module_blacklisted_via_cmdline() -> bool:
"""Verify if the NVIDIA driver has been blacklisted via kernel parameters.
possible formats: module_blacklist= or modprobe.blacklist= followed by a
comma-separated list of modules. See:
https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html
"""
cmdline = Path("/proc/cmdline").read_text(encoding="utf-8")

return bool(
re.search(r"((?<=module_blacklist)|(?<=modprobe\.blacklist))=[\w,]*nvidia", cmdline)
)


def detect_available_tools() -> Set[HWTool]:
"""Return HWTool detected after checking the hardware."""
return raid_hw_verifier() | bmc_hw_verifier() | disk_hw_verifier() | nvidia_gpu_verifier()
Expand Down
17 changes: 5 additions & 12 deletions src/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,8 @@
ExporterSettings,
HWTool,
)
from hardware import get_bmc_address
from hw_tools import (
APTStrategyABC,
DCGMExporterStrategy,
NVIDIADriverStrategy,
SmartCtlExporterStrategy,
SnapStrategy,
)
from hardware import get_bmc_address, is_nvidia_driver_loaded
from hw_tools import APTStrategyABC, DCGMExporterStrategy, SmartCtlExporterStrategy, SnapStrategy

logger = getLogger(__name__)

Expand Down Expand Up @@ -424,7 +418,6 @@ def __init__(self, config: ConfigData):
"""Init."""
self.strategies = [
DCGMExporterStrategy(str(config["dcgm-snap-channel"])),
NVIDIADriverStrategy(),
]
super().__init__(config)

Expand All @@ -434,15 +427,15 @@ def hw_tools() -> Set[HWTool]:
return {HWTool.DCGM}

def validate_exporter_configs(self) -> Tuple[bool, str]:
"""Validate the if the DCGM exporter is able to run."""
"""Validate if the DCGM exporter is able to run."""
valid, msg = super().validate_exporter_configs()
if not valid:
return False, msg

if not NVIDIADriverStrategy().check():
if not is_nvidia_driver_loaded():
return (
False,
"Failed to communicate with NVIDIA driver. See more details in the logs",
"The NVIDIA driver isn't installed or loaded. See more details in the logs",
)
return valid, msg

Expand Down
15 changes: 0 additions & 15 deletions tests/functional/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,21 +161,6 @@ async def test_required_resources(ops_test: OpsTest, required_resources):
assert unit.workload_status_message == AppStatus.MISSING_RELATION


@pytest.mark.abort_on_fail
@pytest.mark.realhw
async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, unit):
"""Test nvidia driver installation."""
if not nvidia_present:
pytest.skip("dcgm not in provided collectors, skipping test")

check_nvidia_driver_cmd = "cat /proc/driver/nvidia/version"
results = await run_command_on_unit(ops_test, unit.name, check_nvidia_driver_cmd)
exists = results.get("return-code") == 0

if not exists:
pytest.fail("Error occured during the driver installation. Check the logs.")


@pytest.mark.abort_on_fail
async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors):
"""Test adding relation with grafana-agent."""
Expand Down
9 changes: 8 additions & 1 deletion tests/unit/test_hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pytest

from hardware import get_bmc_address, hwinfo, lshw
from hardware import get_bmc_address, hwinfo, is_nvidia_driver_loaded, lshw


class TestHwinfo:
Expand Down Expand Up @@ -173,3 +173,10 @@ def test_get_bmc_address(self, mock_check_output, mock_apt):
def test_get_bmc_address_error_handling(self, mock_subprocess, mock_apt):
output = get_bmc_address()
self.assertEqual(output, None)


@pytest.mark.parametrize("path_exists,expected", [(True, True), (False, False)])
@mock.patch("hardware.Path.exists")
def test_is_nvidia_driver_loaded(mock_path, path_exists, expected):
mock_path.return_value = path_exists
assert is_nvidia_driver_loaded() == expected
Loading

0 comments on commit 2206174

Please sign in to comment.