Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add custom metrics config option #20

Merged
merged 24 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions snap/hooks/configure
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,9 @@ if [ -z "$(snapctl get dcgm-exporter-address)" ]; then
# Explicitly use default bind address of dcgm-exporter binary
snapctl set dcgm-exporter-address=":9400"
fi

if [ -z "$(snapctl get dcgm-exporter-metrics-file)" ]; then
# Implicitly use default metrics file of dcgm-exporter binary in $SNAP/etc/dcgm-exporter/default-counters.csv
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
# See details: https://github.com/NVIDIA/dcgm-exporter?tab=readme-ov-file#changing-metrics
snapctl set dcgm-exporter-metrics-file=""
fi
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 11 additions & 1 deletion snap/local/files/run_dcgm_exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,21 @@ set -euo pipefail
# Build the argument list for the dcgm-exporter command
args=()

# Add the dcgm-exporter-address option if it is set. Default: “:9400”
# Add the dcgm-exporter-address option if it is set.
dcgm_exporter_address="$(snapctl get dcgm-exporter-address)"
# Add the dcgm-exporter-metrics-file option if it is set.
dcgm_exporter_metrics_file_path="$SNAP_COMMON/$(snapctl get dcgm-exporter-metrics-file)"

if [ -n "$dcgm_exporter_address" ]; then
args+=("-a" "$dcgm_exporter_address")
fi

# File should be available in the snap data directory under $SNAP_COMMON
if [[ -f "$dcgm_exporter_metrics_file_path" && -s "$dcgm_exporter_metrics_file_path" ]]; then
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
args+=("-f" "$dcgm_exporter_metrics_file_path")
else
echo "Error: DCGM exporter metrics file not found or empty: $dcgm_exporter_metrics_file_path"
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
echo "DCGM exporter is falling back to the default metrics at $SNAP/etc/nvidia/dcgm-exporter/default-counters.csv"
fi

exec "$SNAP/bin/dcgm-exporter" "${args[@]}"
2 changes: 1 addition & 1 deletion snap/snapcraft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ parts:
source: https://github.com/NVIDIA/dcgm-exporter.git
source-type: git
source-tag: 3.3.7-3.5.0
# override build to set custom csv file
# override build to get the default csv files from the upstream
override-build: |
craftctl default
mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter
Expand Down
263 changes: 202 additions & 61 deletions tests/functional/test_snap_dcgm.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,227 @@
import json
import os
import subprocess
import urllib.request
from contextlib import contextmanager

import pytest
from tenacity import Retrying, retry, stop_after_delay, wait_fixed
from tenacity import retry, stop_after_delay, wait_fixed


@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def test_dcgm_exporter():
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def _check_service_active(service: str) -> None:
    """Assert that systemd reports ``service`` as active.

    The check is retried every 2 seconds for up to 10 seconds, so a service
    that is still starting gets a chance to come up.

    :param service: Name of the systemd unit to query.
    """
    command = f"sudo systemctl is-active --quiet {service}".split()
    # `systemctl is-active --quiet` communicates the state through its exit code only
    exit_code = subprocess.call(command)
    assert exit_code == 0, f"{service} is not running"


@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def _check_service_failed(service: str) -> None:
"""Check if a service is in a failed state."""
assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {dcgm_exporter_service}".split()
), f"{dcgm_exporter_service} is not running"
f"sudo systemctl is-failed --quiet {service}".split()
), f"{service} is running"

# Check the exporter endpoint, will raise an exception if the endpoint is not reachable
response = urllib.request.urlopen(endpoint)

# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
assert 200 == response.getcode(), "DCGM exporter endpoint returned an error"
@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def _check_endpoint(endpoint: str) -> None:
    """Assert that an HTTP endpoint is reachable and answers with status 200.

    The check is retried every 5 seconds for up to 30 seconds.

    :param endpoint: URL to request.
    """
    # urlopen raises if the endpoint is not reachable. The timeout keeps a hung
    # connection from blocking past the retry budget, and the context manager
    # closes the response instead of leaking the connection.
    with urllib.request.urlopen(endpoint, timeout=5) as response:
        status_code = response.getcode()
    assert status_code == 200, f"Endpoint {endpoint} returned status code {status_code}"


def test_dcgm_nv_hostengine():
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555
class TestDCGMComponents:
def test_dcgm_exporter(self) -> None:
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"

assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {nv_hostengine_service}".split()
), f"{nv_hostengine_service} is not running"
_check_service_active(dcgm_exporter_service)
# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
_check_endpoint(endpoint)

assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"
def test_dcgm_nv_hostengine(self) -> None:
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555

_check_service_active(nv_hostengine_service)

def test_dcgmi():
"""Test of the dcgmi command."""
result = subprocess.run(
"dcgm.dcgmi discovery -l".split(), check=True, capture_output=True, text=True
)
assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"

# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.stdout.strip(), "DCGMI didn't produce the expected table"


@pytest.mark.parametrize(
"service, config, new_value",
[
("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
],
)
def test_dcgm_bind_config(service: str, config: str, new_value: str):
"""Test snap bind configuration."""
result = subprocess.run(
"sudo snap get dcgm -d".split(), check=True, capture_output=True, text=True
)
dcgm_snap_config = json.loads(result.stdout.strip())
assert config in dcgm_snap_config, f"{config} is not in the snap configuration"
old_value = dcgm_snap_config[config]
def test_dcgmi(self) -> None:
"""Test of the dcgmi command."""
result = subprocess.check_output("dcgm.dcgmi discovery -l".split(), text=True)

# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.strip(), "DCGMI didn't produce the expected table"

def set_config_and_check(value: str):

class TestDCGMConfigs:
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def set_config(cls, service: str, config: str, value: str) -> None:
"""Set a configuration value for a snap service."""
assert 0 == subprocess.call(
f"sudo snap set dcgm {config}={value}".split()
), f"Failed to set {config} to {new_value}"
), f"Failed to set {config} to {value}"

# restart the service to apply the configuration
subprocess.run(f"sudo snap restart {service}".split(), check=True)
subprocess.check_call(f"sudo snap restart {service}".split())

for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)):
with attempt:
assert 0 == subprocess.call(
f"nc -z localhost {value.lstrip(':')}".split()
), f"{service} is not listening on {value}"
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def unset_config(cls, service: str, config: str) -> None:
"""Unset a configuration value for a snap service."""
assert 0 == subprocess.call(
f"sudo snap unset dcgm {config}".split()
), f"Failed to unset {config}"

# Check new config
set_config_and_check(new_value)
subprocess.check_call(f"sudo snap restart {service}".split())

# Revert back
set_config_and_check(str(old_value))
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def check_bind_config(cls, service: str, bind: str) -> None:
    """Assert that something accepts connections on ``bind`` on localhost.

    The check is retried every 2 seconds for up to 10 seconds.

    :param service: Service name, used only in the failure message.
    :param bind: Port to probe; leading colons (as in ":9400") are stripped.
    """
    probe_command = f"nc -z localhost {bind.lstrip(':')}".split()
    probe_status = subprocess.call(probe_command)
    assert probe_status == 0, f"{service} is not listening on {bind}"

@classmethod
def get_config(cls, config: str) -> str:
    """Read one option from the dcgm snap configuration.

    Fails the test if the option is not present in the configuration.

    :param config: Name of the snap option to look up.
    :return: The value of the current configuration, converted to a string
    """
    raw_output = subprocess.check_output("sudo snap get dcgm -d".split(), text=True)
    # `snap get -d` prints the whole configuration as a JSON document
    snap_settings = json.loads(raw_output.strip())
    assert config in snap_settings, f"{config} is not in the snap configuration"
    current_value = snap_settings[config]
    return str(current_value)

@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def check_metric_config(cls, metric_file: str = "") -> None:
    """Inspect the running dcgm-exporter command line for a metrics file argument.

    The check is retried every 2 seconds for up to 10 seconds.

    :param metric_file: The metric file expected after ``-f``; if empty, assert
        that no ``-f`` argument is present at all
    """
    command_lines = subprocess.check_output("ps -C dcgm-exporter -o cmd".split(), text=True)

    if not metric_file:
        # Compare whole tokens so that "-f" embedded in another word does not count
        assert "-f" not in command_lines.split(), "Metric file is loaded, but should not be"
        return

    assert f"-f {metric_file}" in command_lines, f"Metric file {metric_file} is not loaded"

@contextmanager
def bind_config(self, service, config, new_value):
    """Temporarily apply a bind configuration for the duration of a ``with`` block.

    The current value of ``config`` is read first. On exit, whether or not
    the body raised, that value is written back and the service is checked
    to be listening on it again.
    """
    previous_value = self.get_config(config)
    try:
        self.set_config(service, config, new_value)
        yield
    finally:
        # Restore the original setting and make sure the service recovered
        self.set_config(service, config, previous_value)
        self.check_bind_config(service, previous_value)

@pytest.mark.parametrize(
"service, config, new_value",
[
("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
],
)
def test_valid_bind_config(self, service: str, config: str, new_value: str) -> None:
"""Test valid snap bind configuration.

Apply ``new_value`` to ``config`` and verify that ``service`` is listening
on the new bind while the configuration is in effect.
"""
# bind_config writes the previous value back and re-checks the bind on exit
with self.bind_config(service, config, new_value):
self.check_bind_config(service, new_value)

@pytest.mark.parametrize(
"service, config, new_value",
[
("dcgm.dcgm-exporter", "dcgm-exporter-address", "test"),
("dcgm.nv-hostengine", "nv-hostengine-port", "test"),
],
)
def test_invalid_bind_config(self, service: str, config: str, new_value: str) -> None:
"""Test invalid snap bind configuration.

Apply a value that is not a usable bind and verify that the service's
systemd unit ends up in the failed state.
"""
# bind_config writes the previous value back and re-checks the bind on exit
with self.bind_config(service, config, new_value):
# The systemd unit name is the snap service name prefixed with "snap."
_check_service_failed(f"snap.{service}")

@classmethod
@pytest.fixture
def metric_setup(cls):
"""Fixture for metric configuration tests.

Before the test: store the exporter service name, the metrics-file option
name, the metrics endpoint and the snap common directory on the class.
After the test: unset the metrics-file option and verify that the exporter
runs without a ``-f`` argument.
"""
cls.service = "dcgm.dcgm-exporter"
cls.config = "dcgm-exporter-metrics-file"
cls.endpoint = "http://localhost:9400/metrics"
cls.snap_common = "/var/snap/dcgm/common"

# The return value is not used; get_config asserts that the option exists
cls.get_config(cls.config)

yield

# Revert back
cls.unset_config(cls.service, cls.config)
# Called without a metric file: asserts that no "-f" argument is present
cls.check_metric_config()

@pytest.mark.usefixtures("metric_setup")
def test_empty_metric(self) -> None:
"""Test with an empty metric file.

Empty files will not be passed to the exporter
"""
metric_file = "empty-metrics.csv"
metric_file_path = os.path.join(self.snap_common, metric_file)
# $SNAP_COMMON requires root permissions to create a file
subprocess.check_call(f"sudo touch {metric_file_path}".split())
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved

self.set_config(self.service, self.config, metric_file)
self.check_metric_config()
_check_endpoint(self.endpoint)

@pytest.mark.usefixtures("metric_setup")
def test_non_existing_metric(self) -> None:
    """Configure a metrics file name that does not exist on disk.

    The exporter is expected to run without a ``-f`` argument and its
    endpoint to stay reachable.
    """
    missing_file = "unknown.csv"
    self.set_config(self.service, self.config, missing_file)
    # No metric file argument may show up on the exporter command line
    self.check_metric_config()
    _check_endpoint(self.endpoint)

@pytest.mark.usefixtures("metric_setup")
def test_invalid_metric(self) -> None:
    """Test with an invalid metric file.

    The exporter will fail to start due to the invalid metric file
    """
    metric_file = "invalid-metrics.csv"
    metric_file_path = os.path.join(self.snap_common, metric_file)
    # $SNAP_COMMON requires root permissions to create a file, so the content is
    # written through `sudo tee`. It is fed on stdin with an argv list rather than
    # through an interpolated `shell=True` pipeline.
    subprocess.run(
        ["sudo", "tee", metric_file_path],
        input="test\n",
        text=True,
        check=True,
    )

    self.set_config(self.service, self.config, metric_file)
    # The systemd unit name is the snap service name prefixed with "snap."
    _check_service_failed(f"snap.{self.service}")

@pytest.mark.usefixtures("metric_setup")
def test_valid_metric(self) -> None:
    """Test with a valid metric file.

    The endpoint is reachable with the specified metrics
    """
    metric_file = "valid-metrics.csv"
    metric_file_path = os.path.join(self.snap_common, metric_file)
    # $SNAP_COMMON requires root permissions to create a file, so the content is
    # written through `sudo tee`. It is fed on stdin with an argv list rather than
    # through an interpolated `shell=True` pipeline.
    subprocess.run(
        ["sudo", "tee", metric_file_path],
        input="DCGM_FI_DRIVER_VERSION, label, Driver Version\n",
        text=True,
        check=True,
    )

    self.set_config(self.service, self.config, metric_file)
    # The exporter must have been started with "-f <full path>"
    self.check_metric_config(metric_file_path)
    _check_endpoint(self.endpoint)