Skip to content

Commit

Permalink
Add custom metrics config option (#20)
Browse files Browse the repository at this point in the history
* Adds a new snap config option - dcgm-exporter-metrics-file.
* Adds a test for the new configuration.
* Divides tests into two classes: TestDCGMComponents and TestDCGMConfigs.
  • Loading branch information
Deezzir authored Sep 19, 2024
1 parent a1d4ca8 commit c3b3a91
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 63 deletions.
6 changes: 6 additions & 0 deletions snap/hooks/configure
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,9 @@ if [ -z "$(snapctl get dcgm-exporter-address)" ]; then
# Explicitly use default bind address of dcgm-exporter binary
snapctl set dcgm-exporter-address=":9400"
fi

# Make sure the dcgm-exporter-metrics-file option always exists in the snap
# configuration, defaulting it to an empty string when it has not been set.
if [ -z "$(snapctl get dcgm-exporter-metrics-file)" ]; then
# An empty value means no custom metrics file is configured.
# Implicitly use default metrics file of dcgm-exporter binary in $SNAP/etc/dcgm-exporter/default-counters.csv
# See details: https://github.com/NVIDIA/dcgm-exporter?tab=readme-ov-file#changing-metrics
snapctl set dcgm-exporter-metrics-file=""
fi
12 changes: 11 additions & 1 deletion snap/local/files/run_dcgm_exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,21 @@ set -euo pipefail
# Build the argument list for the dcgm-exporter command
args=()

# Bind address for the exporter. When the option is empty, "-a" is omitted and
# the binary uses its own default.
dcgm_exporter_address="$(snapctl get dcgm-exporter-address)"
# Name of a custom metrics file, relative to $SNAP_COMMON. An empty value means
# that no custom file has been configured.
dcgm_exporter_metrics_file="$(snapctl get dcgm-exporter-metrics-file)"

if [ -n "$dcgm_exporter_address" ]; then
    args+=("-a" "$dcgm_exporter_address")
fi

# Only look for a custom metrics file when one is configured. Joining an empty
# option onto $SNAP_COMMON would yield the directory itself, which always fails
# the regular-file test below and would log a misleading error on every start
# of a default installation.
if [ -n "$dcgm_exporter_metrics_file" ]; then
    # File should be available in the snap data directory under $SNAP_COMMON
    dcgm_exporter_metrics_file_path="$SNAP_COMMON/$dcgm_exporter_metrics_file"

    # Pass the file on only if it is a regular file (-f) with non-zero size (-s).
    if [[ -f "$dcgm_exporter_metrics_file_path" && -s "$dcgm_exporter_metrics_file_path" ]]; then
        args+=("-f" "$dcgm_exporter_metrics_file_path")
    else
        # NOTE(review): the configure hook comment refers to
        # $SNAP/etc/dcgm-exporter/default-counters.csv (no "nvidia" component);
        # confirm which path is correct before changing the message below.
        echo "Error: DCGM exporter metrics file not found or empty: $dcgm_exporter_metrics_file_path"
        echo "DCGM exporter is falling back to the default metrics at $SNAP/etc/nvidia/dcgm-exporter/default-counters.csv"
    fi
fi

# Replace this shell with the exporter so it runs as the service's main process.
exec "$SNAP/bin/dcgm-exporter" "${args[@]}"
2 changes: 1 addition & 1 deletion snap/snapcraft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ parts:
source: https://github.com/NVIDIA/dcgm-exporter.git
source-type: git
source-tag: 3.3.7-3.5.0
# override build to set custom csv file
# override build to get the default csv files from the upstream
override-build: |
craftctl default
mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter
Expand Down
263 changes: 202 additions & 61 deletions tests/functional/test_snap_dcgm.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,227 @@
import json
import os
import subprocess
import urllib.request
from contextlib import contextmanager

import pytest
from tenacity import Retrying, retry, stop_after_delay, wait_fixed
from tenacity import retry, stop_after_delay, wait_fixed


@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def test_dcgm_exporter():
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def _check_service_active(service: str) -> None:
    """Assert that a systemd service is active.

    The check is retried (2 s fixed wait, stop after a 10 s delay) to give the
    service time to come up.

    :param service: The systemd unit name to query
    """
    command = f"sudo systemctl is-active --quiet {service}".split()
    # `systemctl is-active --quiet` reports the state through its exit code only
    exit_code = subprocess.call(command)
    assert exit_code == 0, f"{service} is not running"


@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def _check_service_failed(service: str) -> None:
"""Check if a service is in a failed state."""
assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {dcgm_exporter_service}".split()
), f"{dcgm_exporter_service} is not running"
f"sudo systemctl is-failed --quiet {service}".split()
), f"{service} is running"

# Check the exporter endpoint, will raise an exception if the endpoint is not reachable
response = urllib.request.urlopen(endpoint)

# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
assert 200 == response.getcode(), "DCGM exporter endpoint returned an error"
@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def _check_endpoint(endpoint: str) -> None:
    """Assert that an HTTP endpoint is reachable and answers with status 200.

    The check is retried (5 s fixed wait, stop after a 30 s delay) so that a
    freshly restarted service has time to start listening.

    :param endpoint: The URL to request
    """
    # urlopen raises if the endpoint is not reachable. The explicit timeout keeps
    # a hung connection from blocking the retry loop indefinitely, and the context
    # manager closes the response so the connection is not leaked.
    with urllib.request.urlopen(endpoint, timeout=10) as response:
        status_code = response.getcode()
    assert status_code == 200, f"Endpoint {endpoint} returned status code {status_code}"


def test_dcgm_nv_hostengine():
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555
class TestDCGMComponents:
def test_dcgm_exporter(self) -> None:
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"

assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {nv_hostengine_service}".split()
), f"{nv_hostengine_service} is not running"
_check_service_active(dcgm_exporter_service)
# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
_check_endpoint(endpoint)

assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"
def test_dcgm_nv_hostengine(self) -> None:
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555

_check_service_active(nv_hostengine_service)

def test_dcgmi():
"""Test of the dcgmi command."""
result = subprocess.run(
"dcgm.dcgmi discovery -l".split(), check=True, capture_output=True, text=True
)
assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"

# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.stdout.strip(), "DCGMI didn't produce the expected table"


@pytest.mark.parametrize(
"service, config, new_value",
[
("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
],
)
def test_dcgm_bind_config(service: str, config: str, new_value: str):
"""Test snap bind configuration."""
result = subprocess.run(
"sudo snap get dcgm -d".split(), check=True, capture_output=True, text=True
)
dcgm_snap_config = json.loads(result.stdout.strip())
assert config in dcgm_snap_config, f"{config} is not in the snap configuration"
old_value = dcgm_snap_config[config]
def test_dcgmi(self) -> None:
"""Test of the dcgmi command."""
result = subprocess.check_output("dcgm.dcgmi discovery -l".split(), text=True)

# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.strip(), "DCGMI didn't produce the expected table"

def set_config_and_check(value: str):

class TestDCGMConfigs:
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def set_config(cls, service: str, config: str, value: str) -> None:
"""Set a configuration value for a snap service."""
assert 0 == subprocess.call(
f"sudo snap set dcgm {config}={value}".split()
), f"Failed to set {config} to {new_value}"
), f"Failed to set {config} to {value}"

# restart the service to apply the configuration
subprocess.run(f"sudo snap restart {service}".split(), check=True)
subprocess.check_call(f"sudo snap restart {service}".split())

for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)):
with attempt:
assert 0 == subprocess.call(
f"nc -z localhost {value.lstrip(':')}".split()
), f"{service} is not listening on {value}"
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def unset_config(cls, service: str, config: str) -> None:
"""Unset a configuration value for a snap service."""
assert 0 == subprocess.call(
f"sudo snap unset dcgm {config}".split()
), f"Failed to unset {config}"

# Check new config
set_config_and_check(new_value)
subprocess.check_call(f"sudo snap restart {service}".split())

# Revert back
set_config_and_check(str(old_value))
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def check_bind_config(cls, service: str, bind: str) -> None:
    """Check if a service is listening on a specific bind.

    The port is probed on localhost with ``nc -z``; the check is retried
    (2 s fixed wait, stop after a 10 s delay).

    :param service: The service name, used only in the failure message
    :param bind: The bind value as configured: a bare port ("5566"), a
        colon-prefixed port (":9466") or a "host:port" pair
    """
    # Keep only what follows the last colon so that "5566", ":9466" and
    # "host:9466" all resolve to the port number.
    port = bind.rpartition(":")[2]
    assert 0 == subprocess.call(
        f"nc -z localhost {port}".split()
    ), f"{service} is not listening on {bind}"

@classmethod
def get_config(cls, config: str) -> str:
    """Return the current value of a dcgm snap configuration option.

    The whole snap configuration is fetched as JSON with ``snap get dcgm -d``;
    the assertion fails if the requested option is not part of it.

    :param config: The configuration option to look up
    :return: The value of the current configuration, converted to a string
    """
    raw_output = subprocess.check_output("sudo snap get dcgm -d".split(), text=True)
    snap_config = json.loads(raw_output.strip())
    assert config in snap_config, f"{config} is not in the snap configuration"
    current_value = snap_config[config]
    return str(current_value)

@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def check_metric_config(cls, metric_file: str = "") -> None:
    """Verify which metric file, if any, the running dcgm-exporter was started with.

    The command line of the dcgm-exporter process is read with ``ps`` and
    inspected for the ``-f`` option. The check is retried (2 s fixed wait,
    stop after a 10 s delay).

    :param metric_file: The metric file to check for, if empty check if nothing is loaded
    """
    command_lines = subprocess.check_output("ps -C dcgm-exporter -o cmd".split(), text=True)

    if not metric_file:
        # No custom file expected: the "-f" flag must not appear as an argument
        assert "-f" not in command_lines.split(), "Metric file is loaded, but should not be"
        return

    assert f"-f {metric_file}" in command_lines, f"Metric file {metric_file} is not loaded"

@contextmanager
def bind_config(self, service, config, new_value):
    """Temporarily apply a bind configuration for the duration of a ``with`` block.

    The current value is read first, the new value is applied, and on exit
    (even when the body raises) the previous value is restored and the service
    is checked to be listening on it again.

    :param service: The snap service the configuration belongs to
    :param config: The configuration option to change
    :param new_value: The value to apply inside the ``with`` block
    """
    previous_value = self.get_config(config)
    try:
        self.set_config(service, config, new_value)
        yield
    finally:
        # Restore the original setting and make sure the service recovered
        self.set_config(service, config, previous_value)
        self.check_bind_config(service, previous_value)

@pytest.mark.parametrize(
    ("service", "config", "new_value"),
    [
        ("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
        ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
    ],
)
def test_valid_bind_config(self, service: str, config: str, new_value: str) -> None:
    """Apply a valid bind value and verify the service listens on it.

    The previous value is restored by the ``bind_config`` context manager.
    """
    with self.bind_config(service, config, new_value):
        self.check_bind_config(service, new_value)

@pytest.mark.parametrize(
    ("service", "config", "new_value"),
    [
        ("dcgm.dcgm-exporter", "dcgm-exporter-address", "test"),
        ("dcgm.nv-hostengine", "nv-hostengine-port", "test"),
    ],
)
def test_invalid_bind_config(self, service: str, config: str, new_value: str) -> None:
    """Apply an invalid bind value and verify the service ends up in a failed state.

    The previous value is restored by the ``bind_config`` context manager.
    """
    systemd_unit = f"snap.{service}"
    with self.bind_config(service, config, new_value):
        _check_service_failed(systemd_unit)

@classmethod
@pytest.fixture
def metric_setup(cls):
    """Set up and tear down the metric file configuration tests.

    Before the test, the shared service name, option name, endpoint and
    $SNAP_COMMON path are stored on the class and the option is checked to
    exist in the snap configuration. After the test, the option is unset and
    the exporter is checked to be running without a custom metric file.
    """
    cls.snap_common = "/var/snap/dcgm/common"
    cls.endpoint = "http://localhost:9400/metrics"
    cls.config = "dcgm-exporter-metrics-file"
    cls.service = "dcgm.dcgm-exporter"

    # Fails early if the option is missing from the snap configuration
    cls.get_config(cls.config)

    yield

    # Teardown: drop the custom metrics file setting again
    cls.unset_config(cls.service, cls.config)
    cls.check_metric_config()

@pytest.mark.usefixtures("metric_setup")
def test_empty_metric(self) -> None:
    """Configure an empty metric file.

    An empty file is not passed to the exporter, so the service must run
    without ``-f`` and its endpoint must stay reachable.
    """
    file_name = "empty-metrics.csv"
    target_path = os.path.join(self.snap_common, file_name)
    # $SNAP_COMMON requires root permissions to create a file
    touch_command = f"sudo touch {target_path}".split()
    subprocess.check_call(touch_command)

    self.set_config(self.service, self.config, file_name)
    self.check_metric_config()
    _check_endpoint(self.endpoint)

@pytest.mark.usefixtures("metric_setup")
def test_non_existing_metric(self) -> None:
    """Configure a metric file that does not exist.

    A missing file is not passed to the exporter, so the service must run
    without ``-f`` and its endpoint must stay reachable.
    """
    missing_file = "unknown.csv"
    self.set_config(self.service, self.config, missing_file)
    self.check_metric_config()
    _check_endpoint(self.endpoint)

@pytest.mark.usefixtures("metric_setup")
def test_invalid_metric(self) -> None:
    """Configure a metric file with invalid content.

    The exporter is expected to fail to start because of the invalid file.
    """
    file_name = "invalid-metrics.csv"
    target_path = os.path.join(self.snap_common, file_name)
    # $SNAP_COMMON requires root permissions to create a file, hence `sudo tee`
    write_command = f"echo 'test' | sudo tee {target_path}"
    subprocess.check_call(write_command, shell=True)

    self.set_config(self.service, self.config, file_name)
    _check_service_failed(f"snap.{self.service}")

@pytest.mark.usefixtures("metric_setup")
def test_valid_metric(self) -> None:
    """Configure a valid metric file.

    The exporter must be started with the file passed via ``-f`` and its
    endpoint must be reachable.
    """
    file_name = "valid-metrics.csv"
    target_path = os.path.join(self.snap_common, file_name)
    # Write a single-metric CSV into $SNAP_COMMON through `sudo tee`
    write_command = (
        f"echo 'DCGM_FI_DRIVER_VERSION, label, Driver Version' | sudo tee {target_path}"
    )
    subprocess.check_call(write_command, shell=True)

    self.set_config(self.service, self.config, file_name)
    self.check_metric_config(target_path)
    _check_endpoint(self.endpoint)

0 comments on commit c3b3a91

Please sign in to comment.