Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add custom metrics config option #20

Merged
merged 24 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions snap/hooks/configure
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,9 @@ if [ -z "$(snapctl get dcgm-exporter-address)" ]; then
# Explicitly use default bind address of dcgm-exporter binary
snapctl set dcgm-exporter-address=":9400"
fi

if [ -z "$(snapctl get dcgm-exporter-metrics-file)" ]; then
# Implicitly use default metrics file of dcgm-exporter binary in $SNAP/etc/dcgm-exporter/default-counters.csv
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
# See details: https://github.com/NVIDIA/dcgm-exporter?tab=readme-ov-file#changing-metrics
snapctl set dcgm-exporter-metrics-file=""
fi
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
9 changes: 8 additions & 1 deletion snap/local/files/run_dcgm_exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,18 @@ set -euo pipefail
# Build the argument list for the dcgm-exporter command
args=()

# Add the dcgm-exporter-address option if it is set. Default: “:9400”
# Add the dcgm-exporter-address option if it is set.
dcgm_exporter_address="$(snapctl get dcgm-exporter-address)"
# Add the dcgm-exporter-metrics-file option if it is set.
dcgm_exporter_metrics_file_path="$SNAP_COMMON/$(snapctl get dcgm-exporter-metrics-file)"

if [ -n "$dcgm_exporter_address" ]; then
args+=("-a" "$dcgm_exporter_address")
fi

# File should be available in the snap data directory under $SNAP_COMMON
if [[ -f "$dcgm_exporter_metrics_file_path" && -s "$dcgm_exporter_metrics_file_path" ]]; then
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
args+=("-f" "$dcgm_exporter_metrics_file_path")
fi

exec "$SNAP/bin/dcgm-exporter" "${args[@]}"
214 changes: 152 additions & 62 deletions tests/functional/test_snap_dcgm.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,176 @@
import json
import os
import subprocess
import urllib.request

import pytest
from tenacity import Retrying, retry, stop_after_delay, wait_fixed
from tenacity import retry, stop_after_delay, wait_fixed


@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def test_dcgm_exporter():
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def _check_service_active(service: str) -> None:
    """Assert that ``systemctl is-active`` succeeds for ``service``.

    The whole check is retried every 2 seconds for up to 10 seconds, so it
    tolerates a unit that is still starting when the check begins.

    :param service: Name of the systemd unit to query.
    """
    # The command is split on whitespace, so ``service`` must not contain spaces.
    command = f"sudo systemctl is-active --quiet {service}".split()
    exit_code = subprocess.call(command)
    assert exit_code == 0, f"{service} is not running"


@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def _check_service_failed(service: str) -> None:
"""Check if a service is in a failed state."""
assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {dcgm_exporter_service}".split()
), f"{dcgm_exporter_service} is not running"
f"sudo systemctl is-failed --quiet {service}".split()
), f"{service} is running"

# Check the exporter endpoint, will raise an exception if the endpoint is not reachable
response = urllib.request.urlopen(endpoint)

# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
assert 200 == response.getcode(), "DCGM exporter endpoint returned an error"
@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def _check_endpoint(endpoint: str) -> None:
    """Assert that opening ``endpoint`` yields an HTTP 200 response.

    The check is retried every 5 seconds for up to 30 seconds. A single
    attempt fails when the URL cannot be opened (``urlopen`` raises) or when
    the returned status code is not 200.

    :param endpoint: Full URL to request.
    """
    # Use the response as a context manager so the underlying connection is
    # closed after every attempt instead of being left open across retries.
    with urllib.request.urlopen(endpoint) as response:  # will raise if not reachable
        status_code = response.getcode()
    assert status_code == 200, f"Endpoint {endpoint} returned status code {status_code}"


def test_dcgm_nv_hostengine():
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555
class TestDCGMComponents:
def test_dcgm_exporter(self) -> None:
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"

assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {nv_hostengine_service}".split()
), f"{nv_hostengine_service} is not running"
_check_service_active(dcgm_exporter_service)
# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
_check_endpoint(endpoint)

assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"
def test_dcgm_nv_hostengine(self) -> None:
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555

_check_service_active(nv_hostengine_service)

def test_dcgmi():
"""Test of the dcgmi command."""
result = subprocess.run(
"dcgm.dcgmi discovery -l".split(), check=True, capture_output=True, text=True
)
assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"

# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.stdout.strip(), "DCGMI didn't produce the expected table"


@pytest.mark.parametrize(
"service, config, new_value",
[
("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
],
)
def test_dcgm_bind_config(service: str, config: str, new_value: str):
"""Test snap bind configuration."""
result = subprocess.run(
"sudo snap get dcgm -d".split(), check=True, capture_output=True, text=True
)
dcgm_snap_config = json.loads(result.stdout.strip())
assert config in dcgm_snap_config, f"{config} is not in the snap configuration"
old_value = dcgm_snap_config[config]
def test_dcgmi(self) -> None:
"""Test of the dcgmi command."""
result = subprocess.check_output("dcgm.dcgmi discovery -l".split(), text=True)

# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.strip(), "DCGMI didn't produce the expected table"

def set_config_and_check(value: str):

class TestDCGMConfigs:
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def set_config(cls, service: str, config: str, value: str = "") -> None:
"""Set a configuration value for a snap service."""
assert 0 == subprocess.call(
f"sudo snap set dcgm {config}={value}".split()
), f"Failed to set {config} to {new_value}"
), f"Failed to set {config} to {value}"

# restart the service to apply the configuration
subprocess.run(f"sudo snap restart {service}".split(), check=True)

for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)):
with attempt:
assert 0 == subprocess.call(
f"nc -z localhost {value.lstrip(':')}".split()
), f"{service} is not listening on {value}"

# Check new config
set_config_and_check(new_value)

# Revert back
set_config_and_check(str(old_value))
@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def check_bind_config(cls, service: str, bind: str) -> None:
    """Assert that ``nc -z`` can reach the port described by ``bind`` on localhost.

    The check is retried every 2 seconds for up to 10 seconds.

    :param service: Service name, used only in the failure message.
    :param bind: Port to probe; any leading ``:`` characters are stripped
        before it is handed to ``nc``.
    """
    port = bind.lstrip(":")
    probe = f"nc -z localhost {port}".split()
    exit_code = subprocess.call(probe)
    assert exit_code == 0, f"{service} is not listening on {bind}"

@classmethod
def get_config(cls, config: str) -> str:
    """Return the current value of a dcgm snap configuration key.

    Runs ``sudo snap get dcgm -d``, parses its output as JSON and asserts that
    ``config`` is one of the top-level keys.

    :param config: Name of the configuration key to look up.
    :return: The value stored under ``config``, converted to ``str``.
    """
    raw_output = subprocess.check_output("sudo snap get dcgm -d".split(), text=True)
    snap_settings = json.loads(raw_output.strip())
    assert config in snap_settings, f"{config} is not in the snap configuration"
    current_value = snap_settings[config]
    return str(current_value)

@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def check_metric_config(cls, metric_file: str = "") -> None:
"""Check if the metric file is loaded in the dcgm-exporter service.

:param metric_file: The metric file to check for, if empty check if nothing is loaded
"""
result = subprocess.check_output(
"ps -eo cmd | grep '/bin/[d]cgm-exporter'", shell=True, text=True
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved
)

if metric_file:
assert f"-f {metric_file}" in result, f"Metric file {metric_file} is not loaded"
else:
assert "-f" not in result.split(), "Metric file is loaded, but should not be"

@pytest.mark.parametrize(
"service, config, new_value",
[
("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
],
)
def test_dcgm_bind_config(self, service: str, config: str, new_value: str) -> None:
"""Test snap bind configuration."""
old_value = self.get_config(config)

# Valid config
self.set_config(service, config, new_value)
self.check_bind_config(service, new_value)

# Invalid config
self.set_config(service, config, "test")
_check_service_failed(f"snap.{service}")
aieri marked this conversation as resolved.
Show resolved Hide resolved

# Revert back
self.set_config(service, config, old_value)
self.check_bind_config(service, old_value)
aieri marked this conversation as resolved.
Show resolved Hide resolved

def test_dcgm_metric_config(self) -> None:
"""Test the metric file configuration of the dcgm-exporter service."""
service = "dcgm.dcgm-exporter"
config = "dcgm-exporter-metrics-file"
metric_file = "test-metrics.csv"
endpoint = "http://localhost:9400/metrics"
metric_file_path = os.path.join("/var/snap/dcgm/common", metric_file)

self.get_config(config)

# $SNAP_COMMON requires root permissions to create a file
subprocess.check_call(f"sudo touch {metric_file_path}".split())
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved

# Empty metric
self.set_config(service, config, metric_file)
self.check_metric_config()
_check_endpoint(endpoint)

# Non-existing metric
self.set_config(service, config, "unknown.csv")
self.check_metric_config()
_check_endpoint(endpoint)

# Invalid metric
subprocess.check_call(f"echo 'test' | sudo tee {metric_file_path}", shell=True)
self.set_config(service, config, metric_file)
# The exporter will fail to start due to the invalid metric file
_check_service_failed(f"snap.{service}")

# Valid metric
subprocess.check_call(
f"echo 'DCGM_FI_DRIVER_VERSION, label, Driver Version' | sudo tee {metric_file_path}",
shell=True,
)
self.set_config(service, config, metric_file)
self.check_metric_config(metric_file_path)
_check_endpoint(endpoint)

# Revert back
self.set_config(service, config)
self.check_metric_config()
aieri marked this conversation as resolved.
Show resolved Hide resolved

subprocess.check_call(f"sudo rm {metric_file_path}".split())
aieri marked this conversation as resolved.
Show resolved Hide resolved