Skip to content

Commit

Permalink
Add dcgm collector and test for the exporter snap
Browse files Browse the repository at this point in the history
  • Loading branch information
Deezzir committed Nov 22, 2024
1 parent fad3479 commit 70fb2e1
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 4 deletions.
1 change: 1 addition & 0 deletions tests/functional/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def pytest_addoption(parser):
"poweredge_raid",
"lsi_sas_2",
"lsi_sas_3",
"dcgm",
],
help="Provide space-separated list of collectors for testing with real hardware.",
)
Expand Down
46 changes: 42 additions & 4 deletions tests/functional/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
HardwareExporterConfigError,
MetricsFetchError,
assert_metrics,
assert_snap_installed,
get_hardware_exporter_config,
get_metrics_output,
run_command_on_unit,
Expand Down Expand Up @@ -53,6 +54,12 @@ class AppStatus(str, Enum):
CHECKSUM_ERROR = "Fail strategies: "
INVALID_CONFIG_EXPORTER_LOG_LEVEL = "Invalid config: 'exporter-log-level'"
INVALID_REDFISH_CREDS = "Invalid config: 'redfish-username' or 'redfish-password'"
NO_NVIDIA_DRIVER_DETECTED = (
"Failed to communicate with NVIDIA driver. See more details in the logs"
)
MANUAL_NVIDIA_DRIVER_INSTALL = (
"No drivers for the NVIDIA GPU were found. Manual installation is necessary"
)


@pytest.mark.abort_on_fail
Expand Down Expand Up @@ -160,6 +167,25 @@ async def test_required_resources(ops_test: OpsTest, provided_collectors, requir
assert unit.workload_status_message == AppStatus.MISSING_RELATION


@pytest.mark.abort_on_fail
async def test_nvidia_driver_installation(ops_test: OpsTest, provided_collectors, unit):
    """Verify that the NVIDIA driver is present on the unit when dcgm is under test.

    The test is skipped unless "dcgm" is among the provided collectors. The driver
    is considered present when reading /proc/driver/nvidia/version on the unit
    succeeds; otherwise the test fails with a message chosen from the unit's
    workload status message.
    """
    if "dcgm" not in provided_collectors:
        pytest.skip("dcgm not in provided collectors, skipping test")

    results = await run_command_on_unit(ops_test, unit.name, "cat /proc/driver/nvidia/version")
    if results.get("return-code") == 0:
        # Driver version file is readable: nothing more to verify.
        return

    # Driver is missing: report the most specific reason the charm status offers.
    status_message = unit.workload_status_message
    if status_message == AppStatus.MANUAL_NVIDIA_DRIVER_INSTALL:
        pytest.fail("This machine requires manual installation of NVIDIA driver.")
    if status_message == AppStatus.NO_NVIDIA_DRIVER_DETECTED:
        pytest.fail("Nvidia GPU detected, the tests should be run on the real hardware.")
    pytest.fail("Error occured during the driver installation.")


@pytest.mark.abort_on_fail
async def test_cos_agent_relation(ops_test: OpsTest, provided_collectors):
"""Test adding relation with grafana-agent."""
Expand Down Expand Up @@ -358,16 +384,28 @@ async def test_redfish_client_timeout_config(self, app, unit, ops_test, provided
async def test_smarctl_exporter_snap_available(self, ops_test, app, unit):
    """Test if smartctl exporter snap is installed and running on the unit.

    Fails when the snap is not listed on the unit, or when its systemd service
    does not report "active".
    """
    snap_name = "smartctl-exporter"
    # assert_snap_installed is a coroutine function, so it must be awaited: an
    # un-awaited coroutine object is always truthy and this check could never fail.
    if not await assert_snap_installed(ops_test, unit.name, snap_name):
        pytest.fail(f"{snap_name} snap is not installed on the unit.")

    check_active_cmd = "systemctl is-active snap.smartctl-exporter.smartctl-exporter"
    results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
    assert results.get("return-code") == 0
    assert results.get("stdout").strip() == "active"

async def test_dcgm_exporter_snap_available(self, ops_test, app, unit, provided_collectors):
    """Test if dcgm exporter snap is installed and running on the unit.

    Skipped unless "dcgm" is among the provided collectors. Fails when the snap
    is not listed on the unit, or when its systemd service does not report
    "active".
    """
    if "dcgm" not in provided_collectors:
        pytest.skip("dcgm not in provided collectors, skipping test")

    snap_name = "dcgm"
    # assert_snap_installed is a coroutine function, so it must be awaited: an
    # un-awaited coroutine object is always truthy and this check could never fail.
    if not await assert_snap_installed(ops_test, unit.name, snap_name):
        pytest.fail(f"{snap_name} snap is not installed on the unit.")

    check_active_cmd = "systemctl is-active snap.dcgm.dcgm-exporter"
    results = await run_command_on_unit(ops_test, unit.name, check_active_cmd)
    assert results.get("return-code") == 0
    assert results.get("stdout").strip() == "active"

async def test_metrics_available(self, app, unit, ops_test):
"""Test if metrics are available at the expected endpoint on unit."""
# takes some time for exporter to start and metrics to be available
Expand Down
9 changes: 9 additions & 0 deletions tests/functional/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ async def get_metrics_output(ops_test, unit_name) -> Optional[dict[str, list[Met
return parsed_metrics


async def assert_snap_installed(ops_test, unit_name: str, snap_name: str) -> bool:
    """Return whether `snap list <snap_name>` succeeds on the unit and names the snap.

    Despite the name, this does not raise: callers must await it and check the
    returned boolean.

    Args:
        ops_test: test harness object forwarded to run_command_on_unit.
        unit_name: name of the unit on which the command is run.
        snap_name: name of the snap to look for.

    Returns:
        True when the command exits with return code 0 and its stdout contains
        snap_name; False otherwise (including a missing return code or stdout).
    """
    cmd = f"snap list {snap_name}"
    results = await run_command_on_unit(ops_test, unit_name, cmd)
    # Require an explicit zero exit code: a missing or negative code is a failure,
    # and comparing None with `>` would raise TypeError instead of returning False.
    if results.get("return-code") != 0:
        return False
    # Treat absent stdout as empty so the membership test cannot raise on None.
    return snap_name in (results.get("stdout") or "")


def assert_metrics(metrics: list[Metric], expected_metric_values_map: dict[str, float]) -> bool:
"""Assert whether values in obtained list of metrics for a collector are as expected.
Expand Down

0 comments on commit 70fb2e1

Please sign in to comment.