Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't install DCGM if the driver has been blacklisted #363

Merged
merged 7 commits into from
Dec 14, 2024
Merged
55 changes: 53 additions & 2 deletions src/hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import logging
import os
import re
import shutil
import stat
import subprocess
Expand Down Expand Up @@ -669,9 +670,59 @@ def disk_hw_verifier() -> Set[HWTool]:


def nvidia_gpu_verifier() -> Set[HWTool]:
    """Verify if the hardware has NVIDIA gpu and the driver is not blacklisted.

    If the sysadmin has blacklisted the nvidia driver (e.g. to configure pci passthrough)
    DCGM won't be able to manage the GPU.

    Returns:
        ``{HWTool.DCGM}`` when at least one entry returned by
        ``lshw(class_filter="display")`` has "nvidia" in its vendor string and
        the NVIDIA module is not blacklisted; an empty set otherwise.
    """
    gpus = lshw(class_filter="display")
    if not any("nvidia" in gpu.get("vendor", "").lower() for gpu in gpus):
        # No NVIDIA device: nothing to enable and nothing to report about blacklisting.
        return set()

    logger.debug("NVIDIA GPU(s) detected")
    if _is_nvidia_module_blacklisted():
        logger.debug("the NVIDIA driver has been blacklisted. Not enabling DCGM.")
        return set()

    logger.debug("Enabling DCGM.")
    return {HWTool.DCGM}


def _is_nvidia_module_blacklisted() -> bool:
    """Tell whether the NVIDIA driver has been blacklisted on this machine.

    Two sources are consulted, in order: the modprobe configuration and the
    kernel command line. The second one is only checked when the first one
    does not already report a blacklist entry.

    NOTE: we can't simply try loading the module with `modprobe -n <module>` because:
    * the driver may not be installed
    * we don't know the full name of the module
    """
    blocked_by_modprobe = _is_nvidia_module_blacklisted_via_modprobe()
    if blocked_by_modprobe:
        return blocked_by_modprobe
    return _is_nvidia_module_blacklisted_via_cmdline()


def _is_nvidia_module_blacklisted_via_modprobe() -> bool:
    """Tell whether the modprobe configuration blacklists an NVIDIA module.

    The effective configuration is obtained by running ``modprobe -c`` and is
    scanned line by line for an entry starting with ``blacklist nvidia``.

    see the manpages of modprobe and modprobe.d for more details
    """
    dumped_config = subprocess.check_output(["modprobe", "-c"], text=True)

    # modprobe normalizes config options to "blacklist MODULE" so a plain
    # prefix comparison is enough; extra whitespace is not a concern
    for config_line in dumped_config.split("\n"):
        if config_line.startswith("blacklist nvidia"):
            return True
    return False


def _is_nvidia_module_blacklisted_via_cmdline() -> bool:
"""Verify if the NVIDIA driver has been blacklisted via kernel parameters.

possible formats: module_blacklist= or modprobe.blacklist= followed by a
comma-separated list of modules. See:
https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html
"""
cmdline = Path("/proc/cmdline").read_text(encoding="utf-8")

return bool(
re.search(r"((?<=module_blacklist)|(?<=modprobe\.blacklist))=[\w,]*nvidia", cmdline)
)


def detect_available_tools() -> Set[HWTool]:
Expand Down
82 changes: 79 additions & 3 deletions tests/unit/test_hw_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@
StorCLIStrategy,
StrategyABC,
TPRStrategyABC,
_is_nvidia_module_blacklisted,
_is_nvidia_module_blacklisted_via_cmdline,
_is_nvidia_module_blacklisted_via_modprobe,
_raid_hw_verifier_hwinfo,
_raid_hw_verifier_lshw,
bmc_hw_verifier,
Expand Down Expand Up @@ -870,9 +873,9 @@ def test_disk_not_available(self, mock_lshw):


@pytest.mark.parametrize(
"lshw_output, expect",
"lshw_output, blacklist_output, expect",
[
([], set()),
([], False, set()),
(
[
{
Expand All @@ -884,6 +887,7 @@ def test_disk_not_available(self, mock_lshw):
"vendor": "Intel Corporation",
},
],
False,
set(),
),
(
Expand All @@ -905,6 +909,7 @@ def test_disk_not_available(self, mock_lshw):
"vendor": "Intel Corporation",
},
],
False,
{HWTool.DCGM},
),
(
Expand All @@ -926,16 +931,87 @@ def test_disk_not_available(self, mock_lshw):
"vendor": "NVIDIA Corporation",
},
],
False,
{HWTool.DCGM},
),
(
[
{
"id": "display",
"class": "display",
"handle": "PCI:0000:01:00.0",
"description": "VGA compatible controller",
"product": "GA107M [GeForce RTX 3050 Mobile]",
"vendor": "NVIDIA Corporation",
},
],
True,
set(),
),
],
)
@mock.patch("hw_tools._is_nvidia_module_blacklisted")
@mock.patch("hw_tools.lshw")
def test_nvidia_gpu_verifier(mock_lshw, lshw_output, expect):
def test_nvidia_gpu_verifier(
mock_lshw, mock_is_nvidia_blacklisted, lshw_output, blacklist_output, expect
):
mock_lshw.return_value = lshw_output
mock_is_nvidia_blacklisted.return_value = blacklist_output
assert nvidia_gpu_verifier() == expect


@pytest.mark.parametrize("modprobe_bool", [True, False])
@pytest.mark.parametrize("cmdline_bool", [True, False])
aieri marked this conversation as resolved.
Show resolved Hide resolved
@mock.patch("hw_tools._is_nvidia_module_blacklisted_via_modprobe")
@mock.patch("hw_tools._is_nvidia_module_blacklisted_via_cmdline")
def test_is_nvidia_module_blacklisted(
    mock_cmdline_blacklisting, mock_modprobe_blacklisting, cmdline_bool, modprobe_bool
):
    """The driver is reported as blacklisted when either detection method says so."""
    mock_cmdline_blacklisting.return_value = cmdline_bool
    mock_modprobe_blacklisting.return_value = modprobe_bool
    # Derive the expectation from the parametrized inputs rather than by
    # calling the mocks, so the assertion does not depend on mock behaviour.
    assert _is_nvidia_module_blacklisted() == (cmdline_bool or modprobe_bool)
aieri marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize(
    "modprobe_config, expect",
    [
        ("", False),
        ("blacklist nvidia", True),
        ("blacklist foo", False),
        ("foo bar", False),
        ("foo bar\nblacklist nvidiadriver", True),
        ("blacklist foo\nblacklist bar", False),
    ],
)
@mock.patch("hw_tools.subprocess.check_output")
def test_is_nvidia_module_blacklisted_via_modprobe(modprobe_mock, modprobe_config, expect):
    """Check the modprobe-based detection against mocked ``modprobe -c`` output."""
    # The patched check_output stands in for the real `modprobe -c` call.
    modprobe_mock.return_value = modprobe_config
    blacklisted = _is_nvidia_module_blacklisted_via_modprobe()
    assert blacklisted == expect


@pytest.mark.parametrize(
    "cmdline_data, expect",
    [
        ("", False),
        ("foo bar baz", False),
        ("module_blacklist=nvidia", True),
        ("module_blacklist=nvidiaaaaa", True),
        ("module_blacklist=foo,bar,nvidiaaaaa", True),
        ("module_blacklist=foo,bar,baz", False),
        ("modprobe.blacklist=nvidia", True),
        ("modprobe.blacklist=nvidiaaaaa", True),
        ("modprobe.blacklist=foo,bar,nvidiaaaaa", True),
        ("modprobe.blacklist=foo,bar,baz", False),
    ],
)
@mock.patch("hw_tools.Path.read_text")
def test_is_nvidia_module_blacklisted_via_cmdline(read_text_mock, cmdline_data, expect):
    """Check the kernel-cmdline-based detection against mocked file content."""
    # The patched Path.read_text supplies the fake kernel command line.
    read_text_mock.return_value = cmdline_data
    blacklisted = _is_nvidia_module_blacklisted_via_cmdline()
    assert blacklisted == expect
aieri marked this conversation as resolved.
Show resolved Hide resolved


class TestIPMIHWVerifier(unittest.TestCase):
@mock.patch("hw_tools.requests.get")
@mock.patch("hw_tools.get_bmc_address", return_value="1.2.3.4")
Expand Down
Loading