From 2892b99489ca8b53b912862d65e3510cad2c35da Mon Sep 17 00:00:00 2001 From: Deezzir Date: Mon, 16 Sep 2024 19:49:24 -0400 Subject: [PATCH 01/21] Add custom metrics configuration option --- snap/hooks/configure | 6 ++++++ snap/hooks/install | 5 +++++ snap/local/run_dcgm_exporter.sh | 8 ++++++++ 3 files changed, 19 insertions(+) create mode 100755 snap/hooks/install diff --git a/snap/hooks/configure b/snap/hooks/configure index 9d5b294..2b20d80 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -12,3 +12,9 @@ if [ -z "$(snapctl get dcgm-exporter-address)" ]; then # Explictly use default bind address of dcgm-exporter binary snapctl set dcgm-exporter-address=":9400" fi + +if [ -z "$(snapctl get dcgm-exporter-metrics-file)" ]; then + # Implicitly use default metrics file of dcgm-exporter binary in $SNAP/etc/dcgm-exporter + # https://github.com/NVIDIA/dcgm-exporter?tab=readme-ov-file#changing-metrics + snapctl set dcgm-exporter-metrics-file="" +fi diff --git a/snap/hooks/install b/snap/hooks/install new file mode 100755 index 0000000..3947522 --- /dev/null +++ b/snap/hooks/install @@ -0,0 +1,5 @@ +#!/bin/sh -e + +# Create a folder for custom metric file in $SNAP_COMMON +# for a user to supply custom metrics file +mkdir -p $SNAP_COMMON/etc/dcgm-exporter diff --git a/snap/local/run_dcgm_exporter.sh b/snap/local/run_dcgm_exporter.sh index a1dd641..d0f0f15 100644 --- a/snap/local/run_dcgm_exporter.sh +++ b/snap/local/run_dcgm_exporter.sh @@ -6,9 +6,17 @@ args=() # Add the dcgm-exporter-address option if it is set. Default: “:9400” dcgm_exporter_address="$(snapctl get dcgm-exporter-address)" +# Add the dcgm-exporter-metrics-file option if it is set. +dcgm_exporter_metrics_file="$(snapctl get dcgm-exporter-metrics-file)" +dcgm_exporter_metrics_file_path="$SNAP_COMMON/etc/dcgm-exporter/$dcgm_exporter_metrics_file" if [ -n "$dcgm_exporter_address" ]; then args+=("-a" "$dcgm_exporter_address") fi +# File should be available in the snap data directory under $SNAP_COMMON/etc/dcgm-exporter +if [[ -n "$dcgm_exporter_metrics_file" && -f "$dcgm_exporter_metrics_file_path" ]]; then + args+=("-f" "$dcgm_exporter_metrics_file_path") +fi + exec "$SNAP/bin/dcgm-exporter" "${args[@]}" From c5d26e4430175f404248c14b0983d32fd3a6db47 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 14:31:09 -0400 Subject: [PATCH 02/21] Add test for metrics config, divide tests into 2 classes --- snap/hooks/install | 5 - snap/local/run_dcgm_exporter.sh | 2 +- tests/functional/test_snap_dcgm.py | 175 +++++++++++++++++++---------- tox.ini | 5 +- 4 files changed, 122 insertions(+), 65 deletions(-) delete mode 100755 snap/hooks/install diff --git a/snap/hooks/install b/snap/hooks/install deleted file mode 100755 index 3947522..0000000 --- a/snap/hooks/install +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -e - -# Create a folder for custom metric file in $SNAP_COMMON -# for a user to supply custom metrics file -mkdir -p $SNAP_COMMON/etc/dcgm-exporter diff --git a/snap/local/run_dcgm_exporter.sh b/snap/local/run_dcgm_exporter.sh index d0f0f15..10f36ac 100644 --- a/snap/local/run_dcgm_exporter.sh +++ b/snap/local/run_dcgm_exporter.sh @@ -8,7 +8,7 @@ args=() dcgm_exporter_address="$(snapctl get dcgm-exporter-address)" # Add the dcgm-exporter-metrics-file option if it is set. dcgm_exporter_metrics_file="$(snapctl get dcgm-exporter-metrics-file)" -dcgm_exporter_metrics_file_path="$SNAP_COMMON/etc/dcgm-exporter/$dcgm_exporter_metrics_file" +dcgm_exporter_metrics_file_path="$SNAP_COMMON/$dcgm_exporter_metrics_file" if [ -n "$dcgm_exporter_address" ]; then args+=("-a" "$dcgm_exporter_address") diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index e9ee0e4..af3ad2b 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -1,4 +1,5 @@ import json +import os import subprocess import urllib.request @@ -6,81 +7,139 @@ from tenacity import Retrying, retry, stop_after_delay, wait_fixed -@retry(wait=wait_fixed(5), stop=stop_after_delay(30)) -def test_dcgm_exporter(): - """Test of the dcgm-exporter service and its endpoint.""" - dcgm_exporter_service = "snap.dcgm.dcgm-exporter" - endpoint = "http://localhost:9400/metrics" +class TestDCGMComponents: + @classmethod + def check_service_active(cls, service: str): + assert 0 == subprocess.call( + f"sudo systemctl is-active --quiet {service}".split() + ), f"{service} is not running" - assert 0 == subprocess.call( - f"sudo systemctl is-active --quiet {dcgm_exporter_service}".split() - ), f"{dcgm_exporter_service} is not running" + @retry(wait=wait_fixed(5), stop=stop_after_delay(30)) + def test_dcgm_exporter(self) -> None: + """Test of the dcgm-exporter service and its endpoint.""" + dcgm_exporter_service = "snap.dcgm.dcgm-exporter" + endpoint = "http://localhost:9400/metrics" - # Check the exporter endpoint, will raise an exception if the endpoint is not reachable - response = urllib.request.urlopen(endpoint) + self.check_service_active(dcgm_exporter_service) - # The output of the exporter endpoint is not tested - # as in a virtual environment it will not have any GPU metrics - assert 200 == response.getcode(), "DCGM exporter endpoint returned an error" + # Will raise an exception if the endpoint is not reachable + response = urllib.request.urlopen(endpoint) + # The output of the exporter endpoint is not tested + # as in a virtual environment it will not have any GPU metrics + assert 200 == response.getcode(), "DCGM exporter endpoint returned an error" -def test_dcgm_nv_hostengine(): - """Check the dcgm-nv-hostengine service.""" - nv_hostengine_service = "snap.dcgm.nv-hostengine" - nv_hostengine_port = 5555 + def test_dcgm_nv_hostengine(self) -> None: + """Check the dcgm-nv-hostengine service.""" + nv_hostengine_service = "snap.dcgm.nv-hostengine" + nv_hostengine_port = 5555 - assert 0 == subprocess.call( - f"sudo systemctl is-active --quiet {nv_hostengine_service}".split() - ), f"{nv_hostengine_service} is not running" + self.check_service_active(nv_hostengine_service) - assert 0 == subprocess.call( - f"nc -z localhost {nv_hostengine_port}".split() - ), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}" + assert 0 == subprocess.call( + f"nc -z localhost {nv_hostengine_port}".split() + ), f"{nv_hostengine_service} is not listening on port {nv_hostengine_port}" + def test_dcgmi(self) -> None: + """Test of the dcgmi command.""" + result = subprocess.check_output("dcgm.dcgmi discovery -l".split(), text=True) -def test_dcgmi(): - """Test of the dcgmi command.""" - result = subprocess.run( - "dcgm.dcgmi discovery -l".split(), check=True, capture_output=True, text=True - ) + # Test if the command is working and outputs a table with the GPU ID + # The table will be empty in a virtual environment, but the command should still work + assert "GPU ID" in result.strip(), "DCGMI didn't produce the expected table" - # Test if the command is working and outputs a table with the GPU ID - # The table will be empty in a virtual environment, but the command should still work - assert "GPU ID" in result.stdout.strip(), "DCGMI didn't produce the expected table" - - -@pytest.mark.parametrize( - "service, config, new_value", - [ - ("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"), - ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"), - ], -) -def test_dcgm_bind_config(service: str, config: str, new_value: str): - """Test snap bind configuration.""" - result = subprocess.run( - "sudo snap get dcgm -d".split(), check=True, capture_output=True, text=True - ) - dcgm_snap_config = json.loads(result.stdout.strip()) - assert config in dcgm_snap_config, f"{config} is not in the snap configuration" - old_value = dcgm_snap_config[config] - def set_config_and_check(value: str): +class TestDCGMConfigs: + @classmethod + def set_config(cls, service: str, config: str, value: str = "") -> None: + """Set a configuration value for a snap service.""" assert 0 == subprocess.call( f"sudo snap set dcgm {config}={value}".split() - ), f"Failed to set {config} to {new_value}" + ), f"Failed to set {config} to {value}" # restart the service to apply the configuration subprocess.run(f"sudo snap restart {service}".split(), check=True) + @classmethod + def check_bind_config(cls, service: str, bind: str) -> None: + """Check if a service is listening on a specific bind.""" for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)): with attempt: assert 0 == subprocess.call( - f"nc -z localhost {value.lstrip(':')}".split() - ), f"{service} is not listening on {value}" - - # Check new config - set_config_and_check(new_value) - - # Revert back - set_config_and_check(str(old_value)) + f"nc -z localhost {bind.lstrip(':')}".split() + ), f"{service} is not listening on {bind}" + + @classmethod + def check_config_exists(cls, config: str) -> str: + """Check if a configuration exists in the snap configuration. + + :return: The value of the current configuration + """ + result = subprocess.check_output("sudo snap get dcgm -d".split(), text=True) + dcgm_snap_config = json.loads(result.strip()) + assert config in dcgm_snap_config, f"{config} is not in the snap configuration" + return str(dcgm_snap_config[config]) + + @classmethod + def check_metric_config(cls, metric_file: str) -> None: + dcgm_exporter_service = "snap.dcgm.dcgm-exporter" + + result = subprocess.check_output( + f"sudo systemctl show -p ActiveEnterTimestamp {dcgm_exporter_service}".split(), + text=True, + ) + + start_timestamp = result.strip().split("=")[1] + + result = subprocess.check_output( + f"sudo journalctl -u {dcgm_exporter_service} --since '{start_timestamp}'", + shell=True, + text=True, + ) + + assert metric_file in result, f"Metric file {metric_file} is not loaded" + + @pytest.mark.parametrize( + "service, config, new_value", + [ + ("dcgm.dcgm-exporter", "dcgm-exporter-address", ":9466"), + ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"), + ], + ) + def test_dcgm_bind_config(self, service: str, config: str, new_value: str) -> None: + """Test snap bind configuration.""" + old_value = self.check_config_exists(config) + + self.set_config(service, config, new_value) + self.check_bind_config(service, new_value) + + # Revert back + self.set_config(service, config, old_value) + self.check_bind_config(service, old_value) + + def test_dcgm_metric_config(self) -> None: + service = "dcgm.dcgm-exporter" + config = "dcgm-exporter-metrics-file" + metric_file = "test-metrics.csv" + metric_file_path = os.path.join(os.getenv("SNAP_COMMON"), metric_file) + + self.check_config_exists(config) + + with open(metric_file_path, "w") as f: + print(f"Wrote test metrics to {metric_file_path}") + f.writelines( + [ + "# VGPU License status", + "DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status", + ] + ) + + self.set_config(service, config, metric_file) + self.check_metric_config(metric_file_path) + + # Revet back + self.set_config(service, config) + self.check_metric_config("/etc/dcgm-exporter/default-counters.csv") + + if os.path.exists(metric_file_path): + os.remove(metric_file_path) diff --git a/tox.ini b/tox.ini index fd01577..b4bbc6b 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,9 @@ skip_missing_interpreters = True [testenv] basepython = python3 -setenv = PYTHONPATH={toxinidir} +setenv = + PYTHONPATH={toxinidir} + SNAP_COMMON=/var/snap/dcgm/common [testenv:lint] commands = @@ -38,5 +40,6 @@ deps = -r {toxinidir}/tests/functional/requirements.txt passenv = TEST_* + SNAP_* commands = pytest {toxinidir}/tests/functional {posargs:-v} From 5299662380d60133f7dce69a3dbb2a586382c4d7 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 14:47:43 -0400 Subject: [PATCH 03/21] Fix permissions --- .github/workflows/check.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 08d17a4..dc0ac8f 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -90,4 +90,4 @@ jobs: python -m pip install 'tox<5' - name: Run unit tests - run: tox -e func + run: sudo tox -e func From fe54104adddbc1aa44b24b2f741fb3ab18abac1b Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 16:04:32 -0400 Subject: [PATCH 04/21] Use an empty test metric file --- .github/workflows/check.yaml | 2 +- tests/functional/test_snap_dcgm.py | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index dc0ac8f..08d17a4 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -90,4 +90,4 @@ jobs: python -m pip install 'tox<5' - name: Run unit tests - run: sudo tox -e func + run: tox -e func diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index af3ad2b..02e8d3f 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -81,6 +81,7 @@ def check_config_exists(cls, config: str) -> str: return str(dcgm_snap_config[config]) @classmethod + @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) def check_metric_config(cls, metric_file: str) -> None: dcgm_exporter_service = "snap.dcgm.dcgm-exporter" @@ -125,14 +126,7 @@ def test_dcgm_metric_config(self) -> None: self.check_config_exists(config) - with open(metric_file_path, "w") as f: - print(f"Wrote test metrics to {metric_file_path}") - f.writelines( - [ - "# VGPU License status", - "DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status", - ] - ) + subprocess.check_call(f"sudo touch {metric_file_path}".split()) self.set_config(service, config, metric_file) self.check_metric_config(metric_file_path) @@ -141,5 +135,4 @@ def test_dcgm_metric_config(self) -> None: self.set_config(service, config) self.check_metric_config("/etc/dcgm-exporter/default-counters.csv") - if os.path.exists(metric_file_path): - os.remove(metric_file_path) + subprocess.check_call(f"sudo rm {metric_file_path}".split()) From 30da2f3791f3fc2622b2eb7e4859613c46dab4ae Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 16:06:19 -0400 Subject: [PATCH 05/21] Rename --- tests/functional/test_snap_dcgm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 02e8d3f..32a5d43 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -133,6 +133,6 @@ def test_dcgm_metric_config(self) -> None: # Revet back self.set_config(service, config) - self.check_metric_config("/etc/dcgm-exporter/default-counters.csv") + self.check_metric_config("default-counters.csv") subprocess.check_call(f"sudo rm {metric_file_path}".split()) From d9d07d7905f7629acd82d220b0385e9cce93caae Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 17:45:18 -0400 Subject: [PATCH 06/21] Comment refinement --- snap/hooks/configure | 4 ++-- tests/functional/test_snap_dcgm.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/snap/hooks/configure b/snap/hooks/configure index 2b20d80..81dd105 100644 --- a/snap/hooks/configure +++ b/snap/hooks/configure @@ -14,7 +14,7 @@ if [ -z "$(snapctl get dcgm-exporter-address)" ]; then fi if [ -z "$(snapctl get dcgm-exporter-metrics-file)" ]; then - # Implicitly use default metrics file of dcgm-exporter binary in $SNAP/etc/dcgm-exporter - # https://github.com/NVIDIA/dcgm-exporter?tab=readme-ov-file#changing-metrics + # Implicitly use default metrics file of dcgm-exporter binary in $SNAP/etc/dcgm-exporter/default-counters.csv + # See details: https://github.com/NVIDIA/dcgm-exporter?tab=readme-ov-file#changing-metrics snapctl set dcgm-exporter-metrics-file="" fi diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 32a5d43..442e76c 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -70,7 +70,7 @@ def check_bind_config(cls, service: str, bind: str) -> None: ), f"{service} is not listening on {bind}" @classmethod - def check_config_exists(cls, config: str) -> str: + def get_config(cls, config: str) -> str: """Check if a configuration exists in the snap configuration. :return: The value of the current configuration @@ -109,7 +109,7 @@ def check_metric_config(cls, metric_file: str) -> None: ) def test_dcgm_bind_config(self, service: str, config: str, new_value: str) -> None: """Test snap bind configuration.""" - old_value = self.check_config_exists(config) + old_value = self.get_config(config) self.set_config(service, config, new_value) self.check_bind_config(service, new_value) @@ -124,7 +124,7 @@ def test_dcgm_metric_config(self) -> None: metric_file = "test-metrics.csv" metric_file_path = os.path.join(os.getenv("SNAP_COMMON"), metric_file) - self.check_config_exists(config) + self.get_config(config) subprocess.check_call(f"sudo touch {metric_file_path}".split()) From bc7289a05f60aa5536f647390b0d5513baf5d3f3 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 17:57:48 -0400 Subject: [PATCH 07/21] More comment refinement --- snap/local/files/run_dcgm_exporter.sh | 2 +- tests/functional/test_snap_dcgm.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/snap/local/files/run_dcgm_exporter.sh b/snap/local/files/run_dcgm_exporter.sh index 10f36ac..9d1da23 100755 --- a/snap/local/files/run_dcgm_exporter.sh +++ b/snap/local/files/run_dcgm_exporter.sh @@ -14,7 +14,7 @@ if [ -n "$dcgm_exporter_address" ]; then args+=("-a" "$dcgm_exporter_address") fi -# File should be available in the snap data directory under $SNAP_COMMON/etc/dcgm-exporter +# File should be available in the snap data directory under $SNAP_COMMON if [[ -n "$dcgm_exporter_metrics_file" && -f "$dcgm_exporter_metrics_file_path" ]]; then args+=("-f" "$dcgm_exporter_metrics_file_path") fi diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 442e76c..f49400d 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -126,6 +126,7 @@ def test_dcgm_metric_config(self) -> None: self.get_config(config) + # $SNAP_COMMON requires root permissions to create a file subprocess.check_call(f"sudo touch {metric_file_path}".split()) self.set_config(service, config, metric_file) From 547af658d46bd89cf968f78f2fac458a8428ed93 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 20:16:04 -0400 Subject: [PATCH 08/21] Test refinement --- snap/local/files/run_dcgm_exporter.sh | 7 +- tests/functional/test_snap_dcgm.py | 127 ++++++++++++++++++-------- tox.ini | 3 - 3 files changed, 92 insertions(+), 45 deletions(-) diff --git a/snap/local/files/run_dcgm_exporter.sh b/snap/local/files/run_dcgm_exporter.sh index 9d1da23..01410b7 100755 --- a/snap/local/files/run_dcgm_exporter.sh +++ b/snap/local/files/run_dcgm_exporter.sh @@ -4,18 +4,17 @@ set -euo pipefail # Build the argument list for the dcgm-exporter command args=() -# Add the dcgm-exporter-address option if it is set. Default: “:9400” +# Add the dcgm-exporter-address option if it is set. dcgm_exporter_address="$(snapctl get dcgm-exporter-address)" # Add the dcgm-exporter-metrics-file option if it is set. -dcgm_exporter_metrics_file="$(snapctl get dcgm-exporter-metrics-file)" -dcgm_exporter_metrics_file_path="$SNAP_COMMON/$dcgm_exporter_metrics_file" +dcgm_exporter_metrics_file_path="$SNAP_COMMON/$(snapctl get dcgm-exporter-metrics-file)" if [ -n "$dcgm_exporter_address" ]; then args+=("-a" "$dcgm_exporter_address") fi # File should be available in the snap data directory under $SNAP_COMMON -if [[ -n "$dcgm_exporter_metrics_file" && -f "$dcgm_exporter_metrics_file_path" ]]; then +if [[ -f "$dcgm_exporter_metrics_file_path" && -s "$dcgm_exporter_metrics_file_path" ]]; then args+=("-f" "$dcgm_exporter_metrics_file_path") fi diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index f49400d..3caf861 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -4,37 +4,63 @@ import urllib.request import pytest -from tenacity import Retrying, retry, stop_after_delay, wait_fixed +from tenacity import retry, stop_after_delay, wait_fixed -class TestDCGMComponents: - @classmethod - def check_service_active(cls, service: str): - assert 0 == subprocess.call( - f"sudo systemctl is-active --quiet {service}".split() - ), f"{service} is not running" +@retry(wait=wait_fixed(2), stop=stop_after_delay(10)) +def _check_service_active(service: str) -> None: + """Check if a service is active.""" + assert 0 == subprocess.call( + f"sudo systemctl is-active --quiet {service}".split() + ), f"{service} is not running" + + +@retry(wait=wait_fixed(2), stop=stop_after_delay(10)) +def _check_service_failed(service: str) -> None: + """Check if a service is in a failed state.""" + assert 0 == subprocess.call( + f"sudo systemctl is-failed --quiet {service}".split() + ), f"{service} is running" + + +@retry(wait=wait_fixed(5), stop=stop_after_delay(30)) +def _check_endpoint(endpoint: str, reachable: bool = True) -> None: + """Check if an endpoint is reachable. + + :param endpoint: The endpoint to check + :param reachable: If the endpoint should be reachable or not + """ + try: + response = urllib.request.urlopen(endpoint) + status_code = response.getcode() + if reachable: + assert status_code == 200, f"Endpoint {endpoint} returned {status_code}" + else: + raise AssertionError(f"Endpoint {endpoint} is reachable but should not be") + except Exception: + if reachable: + raise AssertionError(f"Endpoint {endpoint} is not reachable") + else: + pass + - @retry(wait=wait_fixed(5), stop=stop_after_delay(30)) +class TestDCGMComponents: def test_dcgm_exporter(self) -> None: """Test of the dcgm-exporter service and its endpoint.""" dcgm_exporter_service = "snap.dcgm.dcgm-exporter" endpoint = "http://localhost:9400/metrics" - self.check_service_active(dcgm_exporter_service) - - # Will raise an exception if the endpoint is not reachable - response = urllib.request.urlopen(endpoint) - + _check_service_active(dcgm_exporter_service) # The output of the exporter endpoint is not tested # as in a virtual environment it will not have any GPU metrics - assert 200 == response.getcode(), "DCGM exporter endpoint returned an error" + _check_endpoint(endpoint) def test_dcgm_nv_hostengine(self) -> None: """Check the dcgm-nv-hostengine service.""" nv_hostengine_service = "snap.dcgm.nv-hostengine" nv_hostengine_port = 5555 - self.check_service_active(nv_hostengine_service) + _check_service_active(nv_hostengine_service) assert 0 == subprocess.call( f"nc -z localhost {nv_hostengine_port}".split() @@ -51,23 +77,22 @@ def test_dcgmi(self) -> None: class TestDCGMConfigs: @classmethod + @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) def set_config(cls, service: str, config: str, value: str = "") -> None: """Set a configuration value for a snap service.""" assert 0 == subprocess.call( f"sudo snap set dcgm {config}={value}".split() ), f"Failed to set {config} to {value}" - # restart the service to apply the configuration subprocess.run(f"sudo snap restart {service}".split(), check=True) @classmethod + @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) def check_bind_config(cls, service: str, bind: str) -> None: """Check if a service is listening on a specific bind.""" - for attempt in Retrying(wait=wait_fixed(2), stop=stop_after_delay(10)): - with attempt: - assert 0 == subprocess.call( - f"nc -z localhost {bind.lstrip(':')}".split() - ), f"{service} is not listening on {bind}" + assert 0 == subprocess.call( + f"nc -z localhost {bind.lstrip(':')}".split() + ), f"{service} is not listening on {bind}" @classmethod def get_config(cls, config: str) -> str: @@ -82,23 +107,19 @@ def get_config(cls, config: str) -> str: @classmethod @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) - def check_metric_config(cls, metric_file: str) -> None: - dcgm_exporter_service = "snap.dcgm.dcgm-exporter" - + def check_metric_config(cls, metric_file: str = "") -> None: + """Check if the metric file is loaded in the dcgm-exporter service. + + :param metric_file: The metric file to check for, if empty the function will check if no metric file is loaded + """ result = subprocess.check_output( - f"sudo systemctl show -p ActiveEnterTimestamp {dcgm_exporter_service}".split(), - text=True, + "ps -eo cmd | grep 'dcgm-exporter'", shell=True, text=True ) - start_timestamp = result.strip().split("=")[1] - - result = subprocess.check_output( - f"sudo journalctl -u {dcgm_exporter_service} --since '{start_timestamp}'", - shell=True, - text=True, - ) - - assert metric_file in result, f"Metric file {metric_file} is not loaded" + if metric_file: + assert f"-f {metric_file}" in result, f"Metric file {metric_file} is not loaded" + else: + assert " -f " not in result, "Metric file is loaded, but should not be" @pytest.mark.parametrize( "service, config, new_value", @@ -111,29 +132,59 @@ def test_dcgm_bind_config(self, service: str, config: str, new_value: str) -> No """Test snap bind configuration.""" old_value = self.get_config(config) + # Valid config self.set_config(service, config, new_value) self.check_bind_config(service, new_value) + # Invalid config + self.set_config(service, config, "test") + _check_service_failed(f"snap.{service}") + # Revert back self.set_config(service, config, old_value) self.check_bind_config(service, old_value) def test_dcgm_metric_config(self) -> None: + """Test the metric file configuration of the dcgm-exporter service.""" service = "dcgm.dcgm-exporter" config = "dcgm-exporter-metrics-file" metric_file = "test-metrics.csv" - metric_file_path = os.path.join(os.getenv("SNAP_COMMON"), metric_file) + endpoint = "http://localhost:9400/metrics" + metric_file_path = os.path.join("/var/snap/dcgm/common", metric_file) self.get_config(config) # $SNAP_COMMON requires root permissions to create a file subprocess.check_call(f"sudo touch {metric_file_path}".split()) + # Empty metric + self.set_config(service, config, metric_file) + self.check_metric_config() + _check_endpoint(endpoint, reachable=True) + + # Non-existing metric + self.set_config(service, config, "unknown.csv") + self.check_metric_config() + _check_endpoint(endpoint, reachable=True) + + # Invalid metric + subprocess.check_call(f"echo 'test' | sudo tee {metric_file_path}", shell=True) + self.set_config(service, config, metric_file) + self.check_metric_config(metric_file_path) + _check_endpoint(endpoint, reachable=False) + + # Valid metric + subprocess.check_call( + f"echo 'DCGM_FI_DRIVER_VERSION, label, Driver Version' | sudo tee {metric_file_path}", + shell=True, + ) self.set_config(service, config, metric_file) self.check_metric_config(metric_file_path) + _check_endpoint(endpoint, reachable=True) - # Revet back + # Revert back self.set_config(service, config) - self.check_metric_config("default-counters.csv") + self.check_metric_config() + _check_endpoint(endpoint, reachable=True) subprocess.check_call(f"sudo rm {metric_file_path}".split()) diff --git a/tox.ini b/tox.ini index b4bbc6b..6efa63f 100644 --- a/tox.ini +++ b/tox.ini @@ -7,8 +7,6 @@ skip_missing_interpreters = True basepython = python3 setenv = PYTHONPATH={toxinidir} - SNAP_COMMON=/var/snap/dcgm/common - [testenv:lint] commands = pflake8 @@ -40,6 +38,5 @@ deps = -r {toxinidir}/tests/functional/requirements.txt passenv = TEST_* - SNAP_* commands = pytest {toxinidir}/tests/functional {posargs:-v} From 21f1316be5d887f77b539f7fa0a5bcd180182f83 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 20:19:01 -0400 Subject: [PATCH 09/21] Fix lint --- tests/functional/test_snap_dcgm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 3caf861..cd87376 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -26,7 +26,7 @@ def _check_service_failed(service: str) -> None: @retry(wait=wait_fixed(5), stop=stop_after_delay(30)) def _check_endpoint(endpoint: str, reachable: bool = True) -> None: """Check if an endpoint is reachable. - + :param endpoint: The endpoint to check :param reachable: If the endpoint should be reachable or not """ @@ -109,8 +109,8 @@ def get_config(cls, config: str) -> str: @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) def check_metric_config(cls, metric_file: str = "") -> None: """Check if the metric file is loaded in the dcgm-exporter service. - - :param metric_file: The metric file to check for, if empty the function will check if no metric file is loaded + + :param metric_file: The metric file to check for, if empty check if nothing is loaded """ result = subprocess.check_output( "ps -eo cmd | grep 'dcgm-exporter'", shell=True, text=True From 5e78cd1796d7d8e05eb26199dd75f57e8f1bfe12 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 20:51:17 -0400 Subject: [PATCH 10/21] Fix check_metric_config --- tests/functional/test_snap_dcgm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index cd87376..2aa29cc 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -113,13 +113,13 @@ def check_metric_config(cls, metric_file: str = "") -> None: :param metric_file: The metric file to check for, if empty check if nothing is loaded """ result = subprocess.check_output( - "ps -eo cmd | grep 'dcgm-exporter'", shell=True, text=True + "ps -eo cmd | grep [d]cgm-exporter", shell=True, text=True ) if metric_file: assert f"-f {metric_file}" in result, f"Metric file {metric_file} is not loaded" else: - assert " -f " not in result, "Metric file is loaded, but should not be" + assert "-f" not in result.split(), "Metric file is loaded, but should not be" @pytest.mark.parametrize( "service, config, new_value", From 3fc4161dd35af96b24111d0bb3f9f31238e2df8c Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 20:54:20 -0400 Subject: [PATCH 11/21] Revert tox.ini --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 6efa63f..9496580 100644 --- a/tox.ini +++ b/tox.ini @@ -5,8 +5,7 @@ skip_missing_interpreters = True [testenv] basepython = python3 -setenv = - PYTHONPATH={toxinidir} +setenv = PYTHONPATH={toxinidir} [testenv:lint] commands = pflake8 From e4eeba4a604da56bdc614402ca29fd72973504ad Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 21:09:37 -0400 Subject: [PATCH 12/21] Revert tox.ini --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 9496580..fd01577 100644 --- a/tox.ini +++ b/tox.ini @@ -6,6 +6,7 @@ skip_missing_interpreters = True [testenv] basepython = python3 setenv = PYTHONPATH={toxinidir} + [testenv:lint] commands = pflake8 From f9959a2173449c9619db12f1949db56e928f388a Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 21:30:47 -0400 Subject: [PATCH 13/21] Fix ps command --- tests/functional/test_snap_dcgm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 2aa29cc..07e13f8 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -113,7 +113,7 @@ def check_metric_config(cls, metric_file: str = "") -> None: :param metric_file: The metric file to check for, if empty check if nothing is loaded """ result = subprocess.check_output( - "ps -eo cmd | grep [d]cgm-exporter", shell=True, text=True + "ps -eo cmd | grep '/bin/[d]cgm-exporter'", shell=True, text=True ) if metric_file: From 5fb6dea4993c36b0b68ac1df8eeb9c38c57ce9ef Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 21:55:12 -0400 Subject: [PATCH 14/21] Fix invalid metric config test --- tests/functional/test_snap_dcgm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 07e13f8..0c985ca 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -170,8 +170,8 @@ def test_dcgm_metric_config(self) -> None: # Invalid metric subprocess.check_call(f"echo 'test' | sudo tee {metric_file_path}", shell=True) self.set_config(service, config, metric_file) - self.check_metric_config(metric_file_path) - _check_endpoint(endpoint, reachable=False) + # The exporter will fail to start due to the invalid metric file + _check_service_failed(f"snap.{service}") # Valid metric subprocess.check_call( @@ -185,6 +185,5 @@ def test_dcgm_metric_config(self) -> None: # Revert back self.set_config(service, config) self.check_metric_config() - _check_endpoint(endpoint, reachable=True) subprocess.check_call(f"sudo rm {metric_file_path}".split()) From 1b35d1b7f1187923bc8304df476a8fbf1d1e641a Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 17 Sep 2024 22:04:04 -0400 Subject: [PATCH 15/21] Simplify check_endpoint --- tests/functional/test_snap_dcgm.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 0c985ca..7c8f63f 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -24,24 +24,11 @@ def _check_service_failed(service: str) -> None: @retry(wait=wait_fixed(5), stop=stop_after_delay(30)) -def _check_endpoint(endpoint: str, reachable: bool = True) -> None: - """Check if an endpoint is reachable. - - :param endpoint: The endpoint to check - :param reachable: If the endpoint should be reachable or not - """ - try: - response = urllib.request.urlopen(endpoint) - status_code = response.getcode() - if reachable: - assert status_code == 200, f"Endpoint {endpoint} returned {status_code}" - else: - raise AssertionError(f"Endpoint {endpoint} is reachable but should not be") - except Exception: - if reachable: - raise AssertionError(f"Endpoint {endpoint} is not reachable") - else: - pass +def _check_endpoint(endpoint: str) -> None: + """Check if an endpoint is reachable.""" + response = urllib.request.urlopen(endpoint) # will raise if not reachable + status_code = response.getcode() + assert status_code == 200, f"Endpoint {endpoint} returned status code {status_code}" class TestDCGMComponents: @@ -160,12 +147,12 @@ def test_dcgm_metric_config(self) -> None: # Empty metric self.set_config(service, config, metric_file) self.check_metric_config() - _check_endpoint(endpoint, reachable=True) + _check_endpoint(endpoint) # Non-existing metric self.set_config(service, config, "unknown.csv") self.check_metric_config() - _check_endpoint(endpoint, reachable=True) + _check_endpoint(endpoint) # Invalid metric subprocess.check_call(f"echo 'test' | sudo tee {metric_file_path}", shell=True) @@ -180,7 +167,7 @@ def test_dcgm_metric_config(self) -> None: ) self.set_config(service, config, metric_file) self.check_metric_config(metric_file_path) - _check_endpoint(endpoint, reachable=True) + _check_endpoint(endpoint) # Revert back self.set_config(service, config) From 30bf2896c1ee79773d76f0fadde80d75cec98a6e Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 18 Sep 2024 10:49:29 -0400 Subject: [PATCH 16/21] Refine --- snap/local/files/run_dcgm_exporter.sh | 2 ++ tests/functional/test_snap_dcgm.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/snap/local/files/run_dcgm_exporter.sh b/snap/local/files/run_dcgm_exporter.sh index 01410b7..6352382 100755 --- a/snap/local/files/run_dcgm_exporter.sh +++ b/snap/local/files/run_dcgm_exporter.sh @@ -16,6 +16,8 @@ fi # File should be available in the snap data directory under $SNAP_COMMON if [[ -f "$dcgm_exporter_metrics_file_path" && -s "$dcgm_exporter_metrics_file_path" ]]; then args+=("-f" "$dcgm_exporter_metrics_file_path") +else + echo "Error: DCGM exporter metrics file not found or empty: $dcgm_exporter_metrics_file_path" fi exec "$SNAP/bin/dcgm-exporter" "${args[@]}" diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 7c8f63f..8befe92 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -142,6 +142,7 @@ def test_dcgm_metric_config(self) -> None: self.get_config(config) # $SNAP_COMMON requires root permissions to create a file + # Will be removed during the test cleanup by the fixture subprocess.check_call(f"sudo touch {metric_file_path}".split()) # Empty metric @@ -172,5 +173,3 @@ def test_dcgm_metric_config(self) -> None: # Revert back self.set_config(service, config) self.check_metric_config() - - subprocess.check_call(f"sudo rm {metric_file_path}".split()) From 8b06be80cb8159532304cd99cbbadffd9135f811 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 18 Sep 2024 11:48:17 -0400 Subject: [PATCH 17/21] Update comment --- snap/snapcraft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index 6319608..8ee8c72 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -87,7 +87,7 @@ parts: source: https://github.com/NVIDIA/dcgm-exporter.git source-type: git source-tag: 3.3.7-3.5.0 - # override build to set custom csv file + # override build to get the default csv files from the upstream override-build: | craftctl default mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter From c1f83661c403fd30edf0edb2f4633f6c21d3d678 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 18 Sep 2024 18:52:43 -0400 Subject: [PATCH 18/21] Separate metric test into separate small tests for clarity --- tests/functional/test_snap_dcgm.py | 120 ++++++++++++++++++++--------- 1 file changed, 82 insertions(+), 38 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 8befe92..330d6d8 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -2,6 +2,7 @@ import os import subprocess import urllib.request +from contextlib import contextmanager import pytest from tenacity import retry, stop_after_delay, wait_fixed @@ -108,6 +109,18 @@ def check_metric_config(cls, metric_file: str = "") -> None: else: assert "-f" not in result.split(), "Metric file is loaded, but should not be" + @contextmanager + def bind_config(self, service, config, new_value): + """Set up a context manager to test snap bind configuration.""" + old_value = self.get_config(config) + try: + self.set_config(service, config, new_value) + yield + finally: + # Revert back + self.set_config(service, config, old_value) + self.check_bind_config(service, old_value) + @pytest.mark.parametrize( "service, config, new_value", [ @@ -115,61 +128,92 @@ def check_metric_config(cls, metric_file: str = "") -> None: ("dcgm.nv-hostengine", "nv-hostengine-port", "5566"), ], ) - def test_dcgm_bind_config(self, service: str, config: str, new_value: str) -> None: - """Test snap bind configuration.""" - old_value = self.get_config(config) + def test_valid_bind_config(self, service: str, config: str, new_value: str) -> None: + """Test valid snap bind configuration.""" + with self.bind_config(service, config, new_value): + self.check_bind_config(service, new_value) + + @pytest.mark.parametrize( + "service, config, new_value", + [ + ("dcgm.dcgm-exporter", "dcgm-exporter-address", "test"), + ("dcgm.nv-hostengine", "nv-hostengine-port", "test"), + ], + ) + def test_invalid_bind_config(self, service: str, config: str, new_value: str) -> None: + """Test invalid snap bind configuration.""" + with self.bind_config(service, config, new_value): + _check_service_failed(f"snap.{service}") + + @classmethod + @pytest.fixture + def metric_setup(cls): + """Fixture for metric configuration tests.""" + cls.service = "dcgm.dcgm-exporter" + cls.config = "dcgm-exporter-metrics-file" + cls.endpoint = "http://localhost:9400/metrics" + cls.snap_common = "/var/snap/dcgm/common" - # Valid config - self.set_config(service, config, new_value) - self.check_bind_config(service, new_value) + cls.get_config(cls.config) - # Invalid config - self.set_config(service, config, "test") - _check_service_failed(f"snap.{service}") + yield # Revert back - self.set_config(service, config, old_value) - self.check_bind_config(service, old_value) - - def test_dcgm_metric_config(self) -> None: - """Test the metric file configuration of the dcgm-exporter service.""" - service = "dcgm.dcgm-exporter" - config = "dcgm-exporter-metrics-file" - metric_file = "test-metrics.csv" - endpoint = "http://localhost:9400/metrics" - metric_file_path = os.path.join("/var/snap/dcgm/common", metric_file) + cls.set_config(cls.service, cls.config) + cls.check_metric_config() - self.get_config(config) + @pytest.mark.usefixtures("metric_setup") + def test_empty_metric(self) -> None: + """Test with an empty metric file. + Empty files will not be passed to the exporter + """ + metric_file = "empty-metrics.csv" + metric_file_path = os.path.join(self.snap_common, metric_file) # $SNAP_COMMON requires root permissions to create a file - # Will be removed during the test cleanup by the fixture subprocess.check_call(f"sudo touch {metric_file_path}".split()) - # Empty metric - self.set_config(service, config, metric_file) + self.set_config(self.service, self.config, metric_file) self.check_metric_config() - _check_endpoint(endpoint) + _check_endpoint(self.endpoint) - # Non-existing metric - self.set_config(service, config, "unknown.csv") + @pytest.mark.usefixtures("metric_setup") + def test_non_existing_metric(self) -> None: + """Test with a non-existing metric file. + + Non-existing files will not be passed to the exporter. + """ + self.set_config(self.service, self.config, "unknown.csv") self.check_metric_config() - _check_endpoint(endpoint) + _check_endpoint(self.endpoint) + + @pytest.mark.usefixtures("metric_setup") + def test_invalid_metric(self) -> None: + """Test with an invalid metric file. - # Invalid metric + The exporter will fail to start due to the invalid metric file + """ + metric_file = "empty-metrics.csv" + metric_file_path = os.path.join(self.snap_common, metric_file) + # $SNAP_COMMON requires root permissions to create a file subprocess.check_call(f"echo 'test' | sudo tee {metric_file_path}", shell=True) - self.set_config(service, config, metric_file) - # The exporter will fail to start due to the invalid metric file - _check_service_failed(f"snap.{service}") - # Valid metric + self.set_config(self.service, self.config, metric_file) + _check_service_failed(f"snap.{self.service}") + + @pytest.mark.usefixtures("metric_setup") + def test_valid_metric(self) -> None: + """Test with a valid metric file. + + The endpoint is reachable with the specified metrics + """ + metric_file = "valid-metrics.csv" + metric_file_path = os.path.join(self.snap_common, metric_file) subprocess.check_call( f"echo 'DCGM_FI_DRIVER_VERSION, label, Driver Version' | sudo tee {metric_file_path}", shell=True, ) - self.set_config(service, config, metric_file) - self.check_metric_config(metric_file_path) - _check_endpoint(endpoint) - # Revert back - self.set_config(service, config) - self.check_metric_config() + self.set_config(self.service, self.config, metric_file) + self.check_metric_config(metric_file_path) + _check_endpoint(self.endpoint) From 4fbb3484a09f17eadfe336a6bb7120789ab0c06c Mon Sep 17 00:00:00 2001 From: Deezzir Date: Wed, 18 Sep 2024 20:17:08 -0400 Subject: [PATCH 19/21] Update metric config error message --- snap/local/files/run_dcgm_exporter.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/snap/local/files/run_dcgm_exporter.sh b/snap/local/files/run_dcgm_exporter.sh index 6352382..ec903ad 100755 --- a/snap/local/files/run_dcgm_exporter.sh +++ b/snap/local/files/run_dcgm_exporter.sh @@ -18,6 +18,7 @@ if [[ -f "$dcgm_exporter_metrics_file_path" && -s "$dcgm_exporter_metrics_file_p args+=("-f" "$dcgm_exporter_metrics_file_path") else echo "Error: DCGM exporter metrics file not found or empty: $dcgm_exporter_metrics_file_path" + echo "DCGM exporter is falling back to the default metrics at $SNAP/etc/nvidia/dcgm-exporter/default-counters.csv" fi exec "$SNAP/bin/dcgm-exporter" "${args[@]}" From 4c09171c817d6dc05598209855614fc964690fb3 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 19 Sep 2024 10:28:05 -0400 Subject: [PATCH 20/21] Refinement --- tests/functional/test_snap_dcgm.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index 330d6d8..e1c5e4a 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -66,7 +66,7 @@ def test_dcgmi(self) -> None: class TestDCGMConfigs: @classmethod @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) - def set_config(cls, service: str, config: str, value: str = "") -> None: + def set_config(cls, service: str, config: str, value: str) -> None: """Set a configuration value for a snap service.""" assert 0 == subprocess.call( f"sudo snap set dcgm {config}={value}".split() @@ -74,6 +74,15 @@ def set_config(cls, service: str, config: str, value: str = "") -> None: subprocess.run(f"sudo snap restart {service}".split(), check=True) + @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) + def unset_config(cls, service: str, config: str) -> None: + """Unset a configuration value for a snap service.""" + assert 0 == subprocess.call( + f"sudo snap unset dcgm {config}".split() + ), f"Failed to unset {config}" + + subprocess.run(f"sudo snap restart {service}".split(), check=True) + @classmethod @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) def check_bind_config(cls, service: str, bind: str) -> None: @@ -101,7 +110,7 @@ def check_metric_config(cls, metric_file: str = "") -> None: :param metric_file: The metric file to check for, if empty check if nothing is loaded """ result = subprocess.check_output( - "ps -eo cmd | grep '/bin/[d]cgm-exporter'", shell=True, text=True + "ps -C dcgm-exporter -o cmd".split(), text=True ) if metric_file: @@ -159,7 +168,7 @@ def metric_setup(cls): yield # Revert back - cls.set_config(cls.service, cls.config) + cls.unset_config(cls.service, cls.config) cls.check_metric_config() @pytest.mark.usefixtures("metric_setup") @@ -193,7 +202,7 @@ def test_invalid_metric(self) -> None: The exporter will fail to start due to the invalid metric file """ - metric_file = "empty-metrics.csv" + metric_file = "invalid-metrics.csv" metric_file_path = os.path.join(self.snap_common, metric_file) # $SNAP_COMMON requires root permissions to create a file subprocess.check_call(f"echo 'test' | sudo tee {metric_file_path}", shell=True) From a64a6d0c88c775890fb92d1fad87f9ba0ad0c99b Mon Sep 17 00:00:00 2001 From: Deezzir Date: Thu, 19 Sep 2024 10:33:06 -0400 Subject: [PATCH 21/21] Reformat --- tests/functional/test_snap_dcgm.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/functional/test_snap_dcgm.py b/tests/functional/test_snap_dcgm.py index e1c5e4a..91ef142 100644 --- a/tests/functional/test_snap_dcgm.py +++ b/tests/functional/test_snap_dcgm.py @@ -72,8 +72,9 @@ def set_config(cls, service: str, config: str, value: str) -> None: f"sudo snap set dcgm {config}={value}".split() ), f"Failed to set {config} to {value}" - subprocess.run(f"sudo snap restart {service}".split(), check=True) + subprocess.check_call(f"sudo snap restart {service}".split()) + @classmethod @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) def unset_config(cls, service: str, config: str) -> None: """Unset a configuration value for a snap service.""" @@ -81,7 +82,7 @@ def unset_config(cls, service: str, config: str) -> None: f"sudo snap unset dcgm {config}".split() ), f"Failed to unset {config}" - subprocess.run(f"sudo snap restart {service}".split(), check=True) + subprocess.check_call(f"sudo snap restart {service}".split()) @classmethod @retry(wait=wait_fixed(2), stop=stop_after_delay(10)) @@ -109,9 +110,7 @@ def check_metric_config(cls, metric_file: str = "") -> None: :param metric_file: The metric file to check for, if empty check if nothing is loaded """ - result = subprocess.check_output( - "ps -C dcgm-exporter -o cmd".split(), text=True - ) + result = subprocess.check_output("ps -C dcgm-exporter -o cmd".split(), text=True) if metric_file: assert f"-f {metric_file}" in result, f"Metric file {metric_file} is not loaded"