From 2206174014d5c1dd698d4123b2336ebe72135726 Mon Sep 17 00:00:00 2001 From: Andrea Ieri Date: Wed, 18 Dec 2024 18:59:46 -0800 Subject: [PATCH] Stop managing the installation of the NVIDIA driver. (#380) While we initially provided automatic installation of the NVIDIA driver as a convenience, we then ran into the complexity of dealing with users wanting to configure pci-passthrough and/or vgpu, and to possibly move across these configurations post-deployment (see #362, #379). After some more discussions, we agreed that deploying a gpu driver is not the responsibility of hardware-observer, but rather of the principal charm that needs to use the gpu (e.g. nova or kubernetes-worker). This commit therefore drops the functionality of automatically installing the driver and determining if it has been blacklisted for a simpler workflow of installing DCGM only if a driver is found to have been installed and loaded. Fixes: #379 --- src/hardware.py | 6 ++ src/hw_tools.py | 110 ++-------------------- src/service.py | 17 +--- tests/functional/test_charm.py | 15 --- tests/unit/test_hardware.py | 9 +- tests/unit/test_hw_tools.py | 161 ++------------------------------- tests/unit/test_service.py | 10 +- 7 files changed, 40 insertions(+), 288 deletions(-) diff --git a/src/hardware.py b/src/hardware.py index eaedaf6e..aacc1840 100644 --- a/src/hardware.py +++ b/src/hardware.py @@ -4,6 +4,7 @@ import logging import subprocess import typing as t +from pathlib import Path from charms.operator_libs_linux.v0 import apt @@ -102,3 +103,8 @@ def hwinfo(*args: str) -> t.Dict[str, str]: key = item.splitlines()[0].strip() hardware[key] = item return hardware + + +def is_nvidia_driver_loaded() -> bool: + """Determine if an NVIDIA driver has been loaded.""" + return Path("/proc/driver/nvidia/version").exists() diff --git a/src/hw_tools.py b/src/hw_tools.py index 39741668..6c35a051 100644 --- a/src/hw_tools.py +++ b/src/hw_tools.py @@ -5,7 +5,6 @@ import logging import os -import re import shutil import stat import subprocess @@ -43,6 +42,7 @@ LSHW_SUPPORTED_STORAGES, get_bmc_address, hwinfo, + is_nvidia_driver_loaded, lshw, ) from keys import HP_KEYS @@ -62,14 +62,6 @@ def __init__(self, tool: HWTool, path: Path): self.message = f"Tool: {tool} path: {path} size is zero" -class ResourceInstallationError(Exception): - """Exception raised when a hardware tool installation fails.""" - - def __init__(self, tool: HWTool): - """Init.""" - super().__init__(f"Installation failed for tool: {tool}") - - def copy_to_snap_common_bin(source: Path, filename: str) -> None: """Copy file to $SNAP_COMMON/bin folder.""" Path(f"{SNAP_COMMON}/bin").mkdir(parents=False, exist_ok=True) @@ -266,54 +258,6 @@ def _create_custom_metrics(self) -> None: raise err -class NVIDIADriverStrategy(APTStrategyABC): - """NVIDIA driver strategy class.""" - - _name = HWTool.NVIDIA_DRIVER - pkg_pattern = r"nvidia(?:-[a-zA-Z-]*)?-(\d+)(?:-[a-zA-Z]*)?" - - def install(self) -> None: - """Install the NVIDIA driver if not present.""" - if Path("/proc/driver/nvidia/version").exists(): - logger.info("NVIDIA driver already installed in the machine") - return - - with open("/proc/modules", encoding="utf-8") as modules: - if "nouveau" in modules.read(): - logger.error("Nouveau driver is loaded. Unload it before installing NVIDIA driver") - raise ResourceInstallationError(self._name) - - logger.info("Installing NVIDIA driver") - apt.add_package("ubuntu-drivers-common", update_cache=True) - - try: - # This can be changed to check_call and not rely in the output if this is fixed - # https://github.com/canonical/ubuntu-drivers-common/issues/106 - # https://bugs.launchpad.net/ubuntu/+source/ubuntu-drivers-common/+bug/2090502 - result = subprocess.check_output("ubuntu-drivers --gpgpu install".split(), text=True) - subprocess.check_call("modprobe nvidia".split()) - - except subprocess.CalledProcessError as err: - logger.error("Failed to install the NVIDIA driver: %s", err) - raise err - - if "No drivers found for installation" in result: - logger.warning( - "No drivers for the NVIDIA GPU were found. Manual installation is necessary" - ) - raise ResourceInstallationError(self._name) - - logger.info("NVIDIA driver installed") - - def remove(self) -> None: - """Drivers shouldn't be removed by the strategy.""" - return None - - def check(self) -> bool: - """Check if driver was installed.""" - return Path("/proc/driver/nvidia/version").exists() - - class SmartCtlExporterStrategy(SnapStrategy): """SmartCtl strategy class.""" @@ -670,61 +614,25 @@ def disk_hw_verifier() -> Set[HWTool]: def nvidia_gpu_verifier() -> Set[HWTool]: - """Verify if the hardware has NVIDIA gpu and the driver is not blacklisted. + """Verify if an NVIDIA gpu is present and the driver is loaded. - If the sysadmin has blacklisted the nvidia driver (e.g. to configure pci passthrough) - DCGM won't be able to manage the GPU + Depending on the usage of the node (local gpu usage, vgpu configuration, + pci passthrough), a driver must or must not be installed. Since hardware + observer has no way to know what is the intention of the operator, we don't + automate the graphics driver installation. This task should be left to the + principal charm that is going to use the gpu. """ gpus = lshw(class_filter="display") if any("nvidia" in gpu.get("vendor", "").lower() for gpu in gpus): logger.debug("NVIDIA GPU(s) detected") - if not _is_nvidia_module_blacklisted(): + if is_nvidia_driver_loaded(): logger.debug("Enabling DCGM.") return {HWTool.DCGM} - logger.debug("the NVIDIA driver has been blacklisted. Not enabling DCGM.") + logger.debug("no NVIDIA driver has been loaded. Not enabling DCGM.") return set() -def _is_nvidia_module_blacklisted() -> bool: - """Verify if the NVIDIA driver has been blacklisted. - - This is currently done by looking into modprobe config and kernel parameters - NOTE: we can't simply try loading the module with `modprobe -n ` because: - * the driver may not be installed - * we don't know the full name of the module - """ - return ( - _is_nvidia_module_blacklisted_via_modprobe() or _is_nvidia_module_blacklisted_via_cmdline() - ) - - -def _is_nvidia_module_blacklisted_via_modprobe() -> bool: - """Verify if the NVIDIA driver has been blacklisted via modprobe config. - - see the manpages of modprobe and modprobe.d for more details - """ - modprobe_config = subprocess.check_output(["modprobe", "-c"], text=True).split("\n") - - # modprobe normalizes config options to "blacklist MODULE" so no need to - # worry about extra whitespace - return any(opt.startswith("blacklist nvidia") for opt in modprobe_config) - - -def _is_nvidia_module_blacklisted_via_cmdline() -> bool: - """Verify if the NVIDIA driver has been blacklisted via kernel parameters. - - possible formats: module_blacklist= or modprobe.blacklist= followed by a - comma-separated list of modules. See: - https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html - """ - cmdline = Path("/proc/cmdline").read_text(encoding="utf-8") - - return bool( - re.search(r"((?<=module_blacklist)|(?<=modprobe\.blacklist))=[\w,]*nvidia", cmdline) - ) - - def detect_available_tools() -> Set[HWTool]: """Return HWTool detected after checking the hardware.""" return raid_hw_verifier() | bmc_hw_verifier() | disk_hw_verifier() | nvidia_gpu_verifier() diff --git a/src/service.py b/src/service.py index 62d17a63..0fb044db 100644 --- a/src/service.py +++ b/src/service.py @@ -20,14 +20,8 @@ ExporterSettings, HWTool, ) -from hardware import get_bmc_address -from hw_tools import ( - APTStrategyABC, - DCGMExporterStrategy, - NVIDIADriverStrategy, - SmartCtlExporterStrategy, - SnapStrategy, -) +from hardware import get_bmc_address, is_nvidia_driver_loaded +from hw_tools import APTStrategyABC, DCGMExporterStrategy, SmartCtlExporterStrategy, SnapStrategy logger = getLogger(__name__) @@ -424,7 +418,6 @@ def __init__(self, config: ConfigData): """Init.""" self.strategies = [ DCGMExporterStrategy(str(config["dcgm-snap-channel"])), - NVIDIADriverStrategy(), ] super().__init__(config) @@ -434,15 +427,15 @@ def hw_tools() -> Set[HWTool]: return {HWTool.DCGM} def validate_exporter_configs(self) -> Tuple[bool, str]: - """Validate the if the DCGM exporter is able to run.""" + """Validate if the DCGM exporter is able to run.""" valid, msg = super().validate_exporter_configs() if not valid: return False, msg - if not NVIDIADriverStrategy().check(): + if not is_nvidia_driver_loaded(): return ( False, - "Failed to communicate with NVIDIA driver. See more details in the logs", + "The NVIDIA driver isn't installed or loaded. See more details in the logs", ) return valid, msg diff --git a/tests/functional/test_charm.py b/tests/functional/test_charm.py index aba5b4fe..2e5bad2e 100644 --- a/tests/functional/test_charm.py +++ b/tests/functional/test_charm.py @@ -161,21 +161,6 @@ async def test_required_resources(ops_test: OpsTest, required_resources): assert unit.workload_status_message == AppStatus.MISSING_RELATION -@pytest.mark.abort_on_fail -@pytest.mark.realhw -async def test_nvidia_driver_installation(ops_test: OpsTest, nvidia_present, unit): - """Test nvidia driver installation.""" - if not nvidia_present: - pytest.skip("dcgm not in provided collectors, skipping test") - - check_nvidia_driver_cmd = "cat /proc/driver/nvidia/version" - results = await run_command_on_unit(ops_test, unit.name, check_nvidia_driver_cmd) - exists = results.get("return-code") == 0 - - if not exists: - pytest.fail("Error occured during the driver installation. Check the logs.") - - @pytest.mark.abort_on_fail async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors): """Test adding relation with grafana-agent.""" diff --git a/tests/unit/test_hardware.py b/tests/unit/test_hardware.py index 88a515d1..17241cf0 100644 --- a/tests/unit/test_hardware.py +++ b/tests/unit/test_hardware.py @@ -4,7 +4,7 @@ import pytest -from hardware import get_bmc_address, hwinfo, lshw +from hardware import get_bmc_address, hwinfo, is_nvidia_driver_loaded, lshw class TestHwinfo: @@ -173,3 +173,10 @@ def test_get_bmc_address(self, mock_check_output, mock_apt): def test_get_bmc_address_error_handling(self, mock_subprocess, mock_apt): output = get_bmc_address() self.assertEqual(output, None) + + +@pytest.mark.parametrize("path_exists,expected", [(True, True), (False, False)]) +@mock.patch("hardware.Path.exists") +def test_is_nvidia_driver_loaded(mock_path, path_exists, expected): + mock_path.return_value = path_exists + assert is_nvidia_driver_loaded() == expected diff --git a/tests/unit/test_hw_tools.py b/tests/unit/test_hw_tools.py index 1574a30a..75cb8697 100644 --- a/tests/unit/test_hw_tools.py +++ b/tests/unit/test_hw_tools.py @@ -27,11 +27,9 @@ IPMIDCMIStrategy, IPMISELStrategy, IPMISENSORStrategy, - NVIDIADriverStrategy, PercCLIStrategy, ResourceChecksumError, ResourceFileSizeZeroError, - ResourceInstallationError, SAS2IRCUStrategy, SAS3IRCUStrategy, SnapStrategy, @@ -39,9 +37,6 @@ StorCLIStrategy, StrategyABC, TPRStrategyABC, - _is_nvidia_module_blacklisted, - _is_nvidia_module_blacklisted_via_cmdline, - _is_nvidia_module_blacklisted_via_modprobe, _raid_hw_verifier_hwinfo, _raid_hw_verifier_lshw, bmc_hw_verifier, @@ -873,7 +868,7 @@ def test_disk_not_available(self, mock_lshw): @pytest.mark.parametrize( - "lshw_output, blacklist_output, expect", + "lshw_output, driver_loaded, expect", [ ([], False, set()), ( @@ -909,7 +904,7 @@ def test_disk_not_available(self, mock_lshw): "vendor": "Intel Corporation", }, ], - False, + True, {HWTool.DCGM}, ), ( @@ -931,7 +926,7 @@ def test_disk_not_available(self, mock_lshw): "vendor": "NVIDIA Corporation", }, ], - False, + True, {HWTool.DCGM}, ), ( @@ -945,73 +940,19 @@ def test_disk_not_available(self, mock_lshw): "vendor": "NVIDIA Corporation", }, ], - True, + False, set(), ), ], ) -@mock.patch("hw_tools._is_nvidia_module_blacklisted") +@mock.patch("hw_tools.Path.exists") @mock.patch("hw_tools.lshw") -def test_nvidia_gpu_verifier( - mock_lshw, mock_is_nvidia_blacklisted, lshw_output, blacklist_output, expect -): +def test_nvidia_gpu_verifier(mock_lshw, mock_driver_path, lshw_output, driver_loaded, expect): mock_lshw.return_value = lshw_output - mock_is_nvidia_blacklisted.return_value = blacklist_output + mock_driver_path.return_value = driver_loaded assert nvidia_gpu_verifier() == expect -@pytest.mark.parametrize("modprobe_bool", [True, False]) -@pytest.mark.parametrize("cmdline_bool", [True, False]) -@mock.patch("hw_tools._is_nvidia_module_blacklisted_via_modprobe") -@mock.patch("hw_tools._is_nvidia_module_blacklisted_via_cmdline") -def test_is_nvidia_module_blacklisted( - mock_cmdline_blacklisting, mock_modprobe_blacklisting, cmdline_bool, modprobe_bool -): - mock_cmdline_blacklisting.return_value = cmdline_bool - mock_modprobe_blacklisting.return_value = modprobe_bool - assert _is_nvidia_module_blacklisted() == ( - mock_cmdline_blacklisting() or mock_modprobe_blacklisting() - ) - - -@pytest.mark.parametrize( - "modprobe_config, expect", - [ - ("", False), - ("blacklist nvidia", True), - ("blacklist foo", False), - ("foo bar", False), - ("foo bar\nblacklist nvidiadriver", True), - ("blacklist foo\nblacklist bar", False), - ], -) -@mock.patch("hw_tools.subprocess.check_output") -def test_is_nvidia_module_blacklisted_via_modprobe(mock_modprobe, modprobe_config, expect): - mock_modprobe.return_value = modprobe_config - assert _is_nvidia_module_blacklisted_via_modprobe() == expect - - -@pytest.mark.parametrize( - "cmdline_data, expect", - [ - ("", False), - ("foo bar baz", False), - ("module_blacklist=nvidia", True), - ("module_blacklist=nvidiaaaaa", True), - ("module_blacklist=foo,bar,nvidiaaaaa", True), - ("module_blacklist=foo,bar,baz", False), - ("modprobe.blacklist=nvidia", True), - ("modprobe.blacklist=nvidiaaaaa", True), - ("modprobe.blacklist=foo,bar,nvidiaaaaa", True), - ("modprobe.blacklist=foo,bar,baz", False), - ], -) -@mock.patch("hw_tools.Path.read_text") -def test_is_nvidia_module_blacklisted_via_cmdline(mock_cmdline, cmdline_data, expect): - mock_cmdline.return_value = cmdline_data - assert _is_nvidia_module_blacklisted_via_cmdline() == expect - - class TestIPMIHWVerifier(unittest.TestCase): @mock.patch("hw_tools.requests.get") @mock.patch("hw_tools.get_bmc_address", return_value="1.2.3.4") @@ -1254,13 +1195,6 @@ def mock_shutil_copy(): yield mocked_copy -@pytest.fixture -def nvidia_driver_strategy(mock_check_output, mock_apt_lib, mock_path, mock_check_call): - strategy = NVIDIADriverStrategy() - strategy.installed_pkgs = mock_path - yield strategy - - @pytest.fixture def dcgm_exporter_strategy(mock_snap_lib, mock_shutil_copy): yield DCGMExporterStrategy("latest/stable") @@ -1294,87 +1228,6 @@ def test_dcgm_create_custom_metrics_copy_fail( dcgm_exporter_strategy.snap_client.restart.assert_not_called() -def test_nvidia_driver_strategy_install_success( - mock_path, mock_check_output, mock_apt_lib, mock_check_call, nvidia_driver_strategy -): - nvidia_version = mock.MagicMock() - nvidia_version.exists.return_value = False - mock_path.return_value = nvidia_version - - nvidia_driver_strategy.install() - - mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) - mock_check_output.assert_called_once_with("ubuntu-drivers --gpgpu install".split(), text=True) - mock_check_call.assert_called_once_with("modprobe nvidia".split()) - - -def test_install_nvidia_drivers_already_installed( - mock_path, mock_apt_lib, nvidia_driver_strategy, mock_check_output -): - nvidia_version = mock.MagicMock() - nvidia_version.exists.return_value = True - mock_path.return_value = nvidia_version - - nvidia_driver_strategy.install() - - mock_apt_lib.add_package.assert_not_called() - mock_check_output.assert_not_called() - - -def test_install_nvidia_drivers_nouveau_installed(mock_path, nvidia_driver_strategy, mock_apt_lib): - nvidia_version = mock.MagicMock() - nvidia_version.exists.return_value = False - mock_path.return_value = nvidia_version - mocked_open = mock.mock_open(read_data="nouveau") - - with mock.patch("builtins.open", mocked_open): - with pytest.raises(ResourceInstallationError): - nvidia_driver_strategy.install() - - mock_apt_lib.add_package.assert_not_called() - mocked_open.assert_called_once_with("/proc/modules", encoding="utf-8") - - -def test_install_nvidia_drivers_subprocess_exception( - mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy -): - nvidia_version = mock.MagicMock() - nvidia_version.exists.return_value = False - mock_path.return_value = nvidia_version - mock_check_output.side_effect = subprocess.CalledProcessError(1, []) - - with pytest.raises(subprocess.CalledProcessError): - nvidia_driver_strategy.install() - - mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) - - -def test_install_nvidia_drivers_no_drivers_found( - mock_path, mock_check_output, mock_apt_lib, nvidia_driver_strategy -): - nvidia_version = mock.MagicMock() - nvidia_version.exists.return_value = False - mock_path.return_value = nvidia_version - mock_check_output.return_value = "No drivers found for installation" - - with pytest.raises(ResourceInstallationError): - nvidia_driver_strategy.install() - - mock_apt_lib.add_package.assert_called_once_with("ubuntu-drivers-common", update_cache=True) - - -def test_nvidia_strategy_remove(nvidia_driver_strategy): - assert nvidia_driver_strategy.remove() is None - - -@pytest.mark.parametrize("present, expected", [(True, True), (False, False)]) -def test_nvidia_strategy_check(nvidia_driver_strategy, mock_path, present, expected): - nvidia_version = mock.MagicMock() - nvidia_version.exists.return_value = present - mock_path.return_value = nvidia_version - assert nvidia_driver_strategy.check() is expected - - @mock.patch("hw_tools.Path.unlink") @mock.patch("hw_tools.Path.exists") @mock.patch("hw_tools.shutil") diff --git a/tests/unit/test_service.py b/tests/unit/test_service.py index f2b17a8c..7f9eb15c 100644 --- a/tests/unit/test_service.py +++ b/tests/unit/test_service.py @@ -731,8 +731,7 @@ def setUp(self) -> None: } ) self.snap_strategy = mock.MagicMock(spec=service.DCGMExporterStrategy) - self.nvidia_strategy = mock.MagicMock(spec=service.NVIDIADriverStrategy) - self.exporter.strategies = [self.snap_strategy, self.nvidia_strategy] + self.exporter.strategies = [self.snap_strategy] def test_exporter_name(self): self.assertEqual(self.exporter.exporter_name, "dcgm") @@ -740,18 +739,19 @@ def test_exporter_name(self): def test_hw_tools(self): self.assertEqual(self.exporter.hw_tools(), {HWTool.DCGM}) - @mock.patch("service.NVIDIADriverStrategy.check", return_value=True) + @mock.patch("service.is_nvidia_driver_loaded", return_value=True) def test_validate_exporter_configs_success(self, _): valid, msg = self.exporter.validate_exporter_configs() self.assertTrue(valid) self.assertEqual(msg, "Exporter config is valid.") - @mock.patch("service.NVIDIADriverStrategy.check", return_value=False) + @mock.patch("service.is_nvidia_driver_loaded", return_value=False) def test_validate_exporter_configs_fails(self, _): valid, msg = self.exporter.validate_exporter_configs() self.assertFalse(valid) self.assertEqual( - msg, "Failed to communicate with NVIDIA driver. See more details in the logs" + msg, + "The NVIDIA driver isn't installed or loaded. See more details in the logs", ) @mock.patch.object(service.BaseExporter, "validate_exporter_configs")