Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add custom metrics config option #20

Merged
merged 24 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions snap/hooks/configure
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,9 @@ if [ -z "$(snapctl get dcgm-exporter-address)" ]; then
# Explicitly use default bind address of dcgm-exporter binary
snapctl set dcgm-exporter-address=":9400"
fi

if [ -z "$(snapctl get dcgm-exporter-metrics-file)" ]; then
# Implicitly use default metrics file of dcgm-exporter binary in $SNAP/etc/dcgm-exporter/default-counters.csv
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
# See details: https://github.com/NVIDIA/dcgm-exporter?tab=readme-ov-file#changing-metrics
snapctl set dcgm-exporter-metrics-file=""
fi
samuelallan72 marked this conversation as resolved.
Show resolved Hide resolved
8 changes: 8 additions & 0 deletions snap/local/files/run_dcgm_exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,17 @@ args=()

# Add the dcgm-exporter-address option if it is set. Default: “:9400”
dcgm_exporter_address="$(snapctl get dcgm-exporter-address)"
# Add the dcgm-exporter-metrics-file option if it is set.
dcgm_exporter_metrics_file="$(snapctl get dcgm-exporter-metrics-file)"
dcgm_exporter_metrics_file_path="$SNAP_COMMON/$dcgm_exporter_metrics_file"

if [ -n "$dcgm_exporter_address" ]; then
args+=("-a" "$dcgm_exporter_address")
fi

# File should be available in the snap data directory under $SNAP_COMMON
if [[ -n "$dcgm_exporter_metrics_file" && -f "$dcgm_exporter_metrics_file_path" ]]; then
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved
args+=("-f" "$dcgm_exporter_metrics_file_path")
fi

exec "$SNAP/bin/dcgm-exporter" "${args[@]}"
165 changes: 109 additions & 56 deletions tests/functional/test_snap_dcgm.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,139 @@
import json
import os
import subprocess
import urllib.request

import pytest
from tenacity import Retrying, retry, stop_after_delay, wait_fixed


@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def test_dcgm_exporter():
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"
class TestDCGMComponents:
@classmethod
def check_service_active(cls, service: str):
assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {service}".split()
), f"{service} is not running"

assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {dcgm_exporter_service}".split()
), f"{dcgm_exporter_service} is not running"
@retry(wait=wait_fixed(5), stop=stop_after_delay(30))
def test_dcgm_exporter(self) -> None:
"""Test of the dcgm-exporter service and its endpoint."""
dcgm_exporter_service = "snap.dcgm.dcgm-exporter"
endpoint = "http://localhost:9400/metrics"

# Check the exporter endpoint, will raise an exception if the endpoint is not reachable
response = urllib.request.urlopen(endpoint)
self.check_service_active(dcgm_exporter_service)

# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
assert 200 == response.getcode(), "DCGM exporter endpoint returned an error"
# Will raise an exception if the endpoint is not reachable
response = urllib.request.urlopen(endpoint)

# The output of the exporter endpoint is not tested
# as in a virtual environment it will not have any GPU metrics
assert 200 == response.getcode(), "DCGM exporter endpoint returned an error"

def test_dcgm_nv_hostengine():
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555
def test_dcgm_nv_hostengine(self) -> None:
"""Check the dcgm-nv-hostengine service."""
nv_hostengine_service = "snap.dcgm.nv-hostengine"
nv_hostengine_port = 5555

assert 0 == subprocess.call(
f"sudo systemctl is-active --quiet {nv_hostengine_service}".split()
), f"{nv_hostengine_service} is not running"
self.check_service_active(nv_hostengine_service)

assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"
assert 0 == subprocess.call(
f"nc -z localhost {nv_hostengine_port}".split()
), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}"

def test_dcgmi(self) -> None:
"""Test of the dcgmi command."""
result = subprocess.check_output("dcgm.dcgmi discovery -l".split(), text=True)

def test_dcgmi():
"""Test of the dcgmi command."""
result = subprocess.run(
"dcgm.dcgmi discovery -l".split(), check=True, capture_output=True, text=True
)
# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.strip(), "DCGMI didn't produce the expected table"

# Test if the command is working and outputs a table with the GPU ID
# The table will be empty in a virtual environment, but the command should still work
assert "GPU ID" in result.stdout.strip(), "DCGMI didn't produce the expected table"


@pytest.mark.parametrize(
"service, config, new_value",
[
("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
],
)
def test_dcgm_bind_config(service: str, config: str, new_value: str):
"""Test snap bind configuration."""
result = subprocess.run(
"sudo snap get dcgm -d".split(), check=True, capture_output=True, text=True
)
dcgm_snap_config = json.loads(result.stdout.strip())
assert config in dcgm_snap_config, f"{config} is not in the snap configuration"
old_value = dcgm_snap_config[config]

def set_config_and_check(value: str):
class TestDCGMConfigs:
@classmethod
def set_config(cls, service: str, config: str, value: str = "") -> None:
"""Set a configuration value for a snap service."""
assert 0 == subprocess.call(
f"sudo snap set dcgm {config}={value}".split()
), f"Failed to set {config} to {new_value}"
), f"Failed to set {config} to {value}"

# restart the service to apply the configuration
subprocess.run(f"sudo snap restart {service}".split(), check=True)

@classmethod
def check_bind_config(cls, service: str, bind: str) -> None:
"""Check if a service is listening on a specific bind."""
for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)):
with attempt:
assert 0 == subprocess.call(
f"nc -z localhost {value.lstrip(':')}".split()
), f"{service} is not listening on {value}"
f"nc -z localhost {bind.lstrip(':')}".split()
), f"{service} is not listening on {bind}"

@classmethod
def get_config(cls, config: str) -> str:
    """Look up a key in the dcgm snap configuration.

    Runs ``sudo snap get dcgm -d``, parses its JSON output and asserts
    that the requested key is present in it.

    :param config: Name of the configuration key to look up.
    :return: The value of the current configuration, converted to ``str``
    """
    command = "sudo snap get dcgm -d".split()
    raw_output = subprocess.check_output(command, text=True)
    snap_config = json.loads(raw_output.strip())
    assert config in snap_config, f"{config} is not in the snap configuration"
    value = snap_config[config]
    return str(value)

@classmethod
@retry(wait=wait_fixed(2), stop=stop_after_delay(10))
def check_metric_config(cls, metric_file: str) -> None:
    """Assert that the dcgm-exporter journal mentions ``metric_file``.

    Only journal entries written since the service last became active
    (its ``ActiveEnterTimestamp``) are searched. Any failure inside this
    method is retried every 2 seconds for up to 10 seconds by the
    ``retry`` decorator.

    :param metric_file: Text expected to appear in the service journal,
        e.g. the path or name of the metrics file.
    """
    dcgm_exporter_service = "snap.dcgm.dcgm-exporter"

    # Output has the form "ActiveEnterTimestamp=<timestamp>"
    result = subprocess.check_output(
        f"sudo systemctl show -p ActiveEnterTimestamp {dcgm_exporter_service}".split(),
        text=True,
    )

    # Split on the first "=" only, and fail with a readable message
    # (instead of an IndexError) when no timestamp is reported.
    _, _, start_timestamp = result.strip().partition("=")
    assert start_timestamp, f"{dcgm_exporter_service} has no ActiveEnterTimestamp"

    # Pass an argument list instead of a shell string: the timestamp
    # contains spaces and must reach journalctl as one argument without
    # relying on shell quoting.
    result = subprocess.check_output(
        ["sudo", "journalctl", "-u", dcgm_exporter_service, "--since", start_timestamp],
        text=True,
    )

    assert metric_file in result, f"Metric file {metric_file} is not loaded"
Deezzir marked this conversation as resolved.
Show resolved Hide resolved

@pytest.mark.parametrize(
    "service, config, new_value",
    [
        ("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"),
        ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"),
    ],
)
def test_dcgm_bind_config(self, service: str, config: str, new_value: str) -> None:
    """Test snap bind configuration.

    Sets ``config`` to ``new_value``, checks that ``service`` listens on
    it, then restores the previous value and checks that as well.

    :param service: Snap service to restart after each configuration change.
    :param config: Name of the snap configuration key to change.
    :param new_value: Temporary bind value to apply.
    """
    old_value = self.get_config(config)

    try:
        self.set_config(service, config, new_value)
        self.check_bind_config(service, new_value)
    finally:
        # Revert back even when the checks above fail, so a failure here
        # does not leave the snap on the test port for later tests.
        self.set_config(service, config, old_value)

    self.check_bind_config(service, old_value)
aieri marked this conversation as resolved.
Show resolved Hide resolved

def test_dcgm_metric_config(self) -> None:
service = "dcgm.dcgm-exporter"
config = "dcgm-exporter-metrics-file"
metric_file = "test-metrics.csv"
metric_file_path = os.path.join(os.getenv("SNAP_COMMON"), metric_file)
aieri marked this conversation as resolved.
Show resolved Hide resolved

self.get_config(config)

# $SNAP_COMMON requires root permissions to create a file
subprocess.check_call(f"sudo touch {metric_file_path}".split())
gabrielcocenza marked this conversation as resolved.
Show resolved Hide resolved

self.set_config(service, config, metric_file)
self.check_metric_config(metric_file_path)

# Check new config
set_config_and_check(new_value)
# Revert back
aieri marked this conversation as resolved.
Show resolved Hide resolved
self.set_config(service, config)
self.check_metric_config("default-counters.csv")

# Revert back
set_config_and_check(str(old_value))
subprocess.check_call(f"sudo rm {metric_file_path}".split())
aieri marked this conversation as resolved.
Show resolved Hide resolved
5 changes: 4 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ skip_missing_interpreters = True

[testenv]
basepython = python3
setenv = PYTHONPATH={toxinidir}
setenv =
PYTHONPATH={toxinidir}
SNAP_COMMON=/var/snap/dcgm/common

[testenv:lint]
commands =
Expand Down Expand Up @@ -38,5 +40,6 @@ deps =
-r {toxinidir}/tests/functional/requirements.txt
passenv =
TEST_*
SNAP_*
commands =
pytest {toxinidir}/tests/functional {posargs:-v}