From e016bbba4acd85c920a7efdc6d82893992df4c65 Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Tue, 19 Nov 2024 17:57:37 +0100 Subject: [PATCH 1/6] nvme_metrics: change nvmecli metric type to Info This is a breaking change, as it renames the metric nvme_nvmecli to nvme_nvmecli_info. Signed-off-by: Daniel Swarbrick --- nvme_metrics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/nvme_metrics.py b/nvme_metrics.py index c7dbaec..f6e3a6c 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -74,10 +74,7 @@ "Device error log entry count", ["device"], namespace=namespace, registry=registry, ), - # FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change - # will result in the metric having a "_info" suffix automatically appended, which is arguably - # a breaking change. - "nvmecli": Gauge( + "nvmecli": Info( "nvmecli", "nvme-cli tool information", ["version"], namespace=namespace, registry=registry, @@ -159,7 +156,7 @@ def main(): cli_version = match.group(1) else: cli_version = "unknown" - metrics["nvmecli"].labels(cli_version).set(1) + metrics["nvmecli"].labels(cli_version) device_list = exec_nvme_json("list") From 4e2c7e1d6e084645c073d8a55fd0574769e3803b Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Tue, 19 Nov 2024 18:09:52 +0100 Subject: [PATCH 2/6] nvme_metrics: refactor device_info metric as controller_info This is a breaking change, as it renames the existing metric nvme_device_info to nvme_controller_info. The previous "device" label is now "controller", and takes the form of e.g. "nvme0" instead of "nvme0n1". A new label "transport" is also added to the renamed metric. Signed-off-by: Daniel Swarbrick --- nvme_metrics.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/nvme_metrics.py b/nvme_metrics.py index f6e3a6c..f148750 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -34,6 +34,12 @@ "Device controller busy time in seconds", ["device"], namespace=namespace, registry=registry, ), + "controller_info": Info( + "controller", + "Controller information", + ["controller", "model", "firmware", "serial", "transport"], namespace=namespace, + registry=registry, + ), "critical_warning": Gauge( "critical_warning", "Device critical warning bitmap field", @@ -49,11 +55,6 @@ "Number of 512-byte data units written by host, reported in thousands", ["device"], namespace=namespace, registry=registry, ), - "device_info": Info( - "device", - "Device information", - ["device", "model", "firmware", "serial"], namespace=namespace, registry=registry, - ), "host_read_commands": Counter( "host_read_commands_total", "Device read commands from host", @@ -163,20 +164,17 @@ def main(): for device in device_list["Devices"]: for subsys in device["Subsystems"]: for ctrl in subsys["Controllers"]: + metrics["controller_info"].labels( + ctrl["Controller"], + ctrl["ModelNumber"], + ctrl["Firmware"], + ctrl["SerialNumber"].strip(), + ctrl["Transport"], + ) + for ns in ctrl["Namespaces"]: device_name = ns["NameSpace"] - # FIXME: This metric ought to be refactored into a "controller_info" metric, - # since it contains information that is not unique to the namespace. However, - # previous versions of this collector erroneously referred to namespaces, e.g. - # "nvme0n1", as devices, so preserve the former behaviour for now. - metrics["device_info"].labels( - device_name, - ctrl["ModelNumber"], - ctrl["Firmware"], - ctrl["SerialNumber"].strip(), - ) - metrics["sector_size"].labels(device_name).set(ns["SectorSize"]) metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"]) metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"]) From da956a5a3c55579846ba806101c358edf33ec9fc Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Thu, 21 Nov 2024 07:24:43 +0100 Subject: [PATCH 3/6] nvme_metrics: fetch device global SMART log once per controller Change "device" label in controller-specific metrics to "controller". This also means that the label value will be the NVMe character device name, e.g. "nvme0", instead of the previously used namespace block device name, e.g. "nvme0n1". Separate metric declaration dict into controller-specific and namespace-specific groups for easier maintenance. Signed-off-by: Daniel Swarbrick --- nvme_metrics.py | 147 ++++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 74 deletions(-) diff --git a/nvme_metrics.py b/nvme_metrics.py index f148750..08e3e6f 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -24,15 +24,22 @@ metrics = { # fmt: off + "nvmecli": Info( + "nvmecli", + "nvme-cli tool information", + ["version"], namespace=namespace, registry=registry, + ), + + # Controller-specific (e.g. "nvme0") metrics "avail_spare": Gauge( "available_spare_ratio", "Device available spare ratio", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "controller_busy_time": Counter( "controller_busy_time_seconds", "Device controller busy time in seconds", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "controller_info": Info( "controller", @@ -43,81 +50,78 @@ "critical_warning": Gauge( "critical_warning", "Device critical warning bitmap field", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "data_units_read": Counter( "data_units_read_total", "Number of 512-byte data units read by host, reported in thousands", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "data_units_written": Counter( "data_units_written_total", "Number of 512-byte data units written by host, reported in thousands", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "host_read_commands": Counter( "host_read_commands_total", "Device read commands from host", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "host_write_commands": Counter( "host_write_commands_total", "Device write commands from host", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "media_errors": Counter( "media_errors_total", "Device media errors total", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "num_err_log_entries": Counter( "num_err_log_entries_total", "Device error log entry count", - ["device"], namespace=namespace, registry=registry, - ), - "nvmecli": Info( - "nvmecli", - "nvme-cli tool information", - ["version"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "percent_used": Gauge( "percentage_used_ratio", "Device percentage used ratio", - ["device"], namespace=namespace, registry=registry, - ), - "physical_size": Gauge( - "physical_size_bytes", - "Device size in bytes", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "power_cycles": Counter( "power_cycles_total", "Device number of power cycles", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "power_on_hours": Counter( "power_on_hours_total", "Device power-on hours", - ["device"], namespace=namespace, registry=registry, - ), - "sector_size": Gauge( - "sector_size_bytes", - "Device sector size in bytes", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "spare_thresh": Gauge( "available_spare_threshold_ratio", "Device available spare threshold ratio", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "temperature": Gauge( "temperature_celsius", "Device temperature in degrees Celsius", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "unsafe_shutdowns": Counter( "unsafe_shutdowns_total", "Device number of unsafe shutdowns", + ["controller"], namespace=namespace, registry=registry, + ), + + # Namespace-specific (e.g. "nvme0n1") metrics + "physical_size": Gauge( + "physical_size_bytes", + "Device size in bytes", + ["device"], namespace=namespace, registry=registry, + ), + "sector_size": Gauge( + "sector_size_bytes", + "Device sector size in bytes", ["device"], namespace=namespace, registry=registry, ), "used_bytes": Gauge( @@ -164,8 +168,10 @@ def main(): for device in device_list["Devices"]: for subsys in device["Subsystems"]: for ctrl in subsys["Controllers"]: + ctrl_dev = ctrl["Controller"] + metrics["controller_info"].labels( - ctrl["Controller"], + ctrl_dev, ctrl["ModelNumber"], ctrl["Firmware"], ctrl["SerialNumber"].strip(), @@ -179,50 +185,43 @@ def main(): metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"]) metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"]) - # FIXME: The smart-log should only need to be fetched once per controller, not - # per namespace. However, in order to preserve legacy metric labels, fetch it - # per namespace anyway. Most consumer grade SSDs will only have one namespace. - smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name)) - - # Various counters in the NVMe specification are 128-bit, which would have to - # discard resolution if converted to a JSON number (i.e., float64_t). Instead, - # nvme-cli marshals them as strings. As such, they need to be explicitly cast - # to int or float when using them in Counter metrics. - metrics["data_units_read"].labels(device_name).inc( - int(smart_log["data_units_read"]) - ) - metrics["data_units_written"].labels(device_name).inc( - int(smart_log["data_units_written"]) - ) - metrics["host_read_commands"].labels(device_name).inc( - int(smart_log["host_read_commands"]) - ) - metrics["host_write_commands"].labels(device_name).inc( - int(smart_log["host_write_commands"]) - ) - metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100) - metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100) - metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100) - metrics["critical_warning"].labels(device_name).set( - smart_log["critical_warning"]["value"] - ) - metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"])) - metrics["num_err_log_entries"].labels(device_name).inc( - int(smart_log["num_err_log_entries"]) - ) - metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"])) - metrics["power_on_hours"].labels(device_name).inc( - int(smart_log["power_on_hours"]) - ) - metrics["controller_busy_time"].labels(device_name).inc( - int(smart_log["controller_busy_time"]) - ) - metrics["unsafe_shutdowns"].labels(device_name).inc( - int(smart_log["unsafe_shutdowns"]) - ) - - # NVMe reports temperature in kelvins; convert it to degrees Celsius. - metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273) + # Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace. + # Fetch the device global SMART log by omitting any --namespace-id flag. + smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"])) + + # Various counters in the NVMe specification are 128-bit, which would have to + # discard resolution if converted to a JSON number (i.e., float64_t). Instead, + # nvme-cli marshals them as strings. As such, they need to be explicitly cast to int + # or float when using them in Counter metrics. + metrics["data_units_read"].labels(ctrl_dev).inc(int(smart_log["data_units_read"])) + metrics["data_units_written"].labels(ctrl_dev).inc( + int(smart_log["data_units_written"]) + ) + metrics["host_read_commands"].labels(ctrl_dev).inc( + int(smart_log["host_read_commands"]) + ) + metrics["host_write_commands"].labels(ctrl_dev).inc( + int(smart_log["host_write_commands"]) + ) + metrics["avail_spare"].labels(ctrl_dev).set(smart_log["avail_spare"] / 100) + metrics["spare_thresh"].labels(ctrl_dev).set(smart_log["spare_thresh"] / 100) + metrics["percent_used"].labels(ctrl_dev).set(smart_log["percent_used"] / 100) + metrics["critical_warning"].labels(ctrl_dev).set( + smart_log["critical_warning"]["value"] + ) + metrics["media_errors"].labels(ctrl_dev).inc(int(smart_log["media_errors"])) + metrics["num_err_log_entries"].labels(ctrl_dev).inc( + int(smart_log["num_err_log_entries"]) + ) + metrics["power_cycles"].labels(ctrl_dev).inc(int(smart_log["power_cycles"])) + metrics["power_on_hours"].labels(ctrl_dev).inc(int(smart_log["power_on_hours"])) + metrics["controller_busy_time"].labels(ctrl_dev).inc( + int(smart_log["controller_busy_time"]) + ) + metrics["unsafe_shutdowns"].labels(ctrl_dev).inc(int(smart_log["unsafe_shutdowns"])) + + # NVMe reports temperature in kelvins; convert it to degrees Celsius. + metrics["temperature"].labels(ctrl_dev).set(smart_log["temperature"] - 273) if __name__ == "__main__": From 895fbb1cff573c271233784562896cac73883a94 Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Thu, 21 Nov 2024 07:34:02 +0100 Subject: [PATCH 4/6] nvme_metrics: introduce nvme_namespace_info metric Introduce info metric to facilitate join-relationship between namespace-specific and controller-specific metrics. e.g. nvme_namespace_info{controller="nvme0",namepace="nvme0n1",nsid="1"} Signed-off-by: Daniel Swarbrick --- nvme_metrics.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/nvme_metrics.py b/nvme_metrics.py index 08e3e6f..4e785a2 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -114,6 +114,11 @@ ), # Namespace-specific (e.g. "nvme0n1") metrics + "namespace_info": Info( + "namespace", + "Namespace information", + ["namepace", "nsid", "controller"], namespace=namespace, registry=registry, + ), "physical_size": Gauge( "physical_size_bytes", "Device size in bytes", @@ -179,11 +184,17 @@ def main(): ) for ns in ctrl["Namespaces"]: - device_name = ns["NameSpace"] + ns_dev = ns["NameSpace"] + + metrics["namespace_info"].labels( + ns_dev, + ns["NSID"], + ctrl_dev, + ) - metrics["sector_size"].labels(device_name).set(ns["SectorSize"]) - metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"]) - metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"]) + metrics["sector_size"].labels(ns_dev).set(ns["SectorSize"]) + metrics["physical_size"].labels(ns_dev).set(ns["PhysicalSize"]) + metrics["used_bytes"].labels(ns_dev).set(ns["UsedBytes"]) # Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace. # Fetch the device global SMART log by omitting any --namespace-id flag. From 75f33a7e85ea535e8a293b82eff563c1639456dc Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Thu, 21 Nov 2024 07:44:18 +0100 Subject: [PATCH 5/6] nvme_metrics: use device label in namespace_info metric Use "device" rather than "namespace" label so that it matches the other namespace-specific metrics. Signed-off-by: Daniel Swarbrick --- nvme_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvme_metrics.py b/nvme_metrics.py index 4e785a2..10b3053 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -117,7 +117,7 @@ "namespace_info": Info( "namespace", "Namespace information", - ["namepace", "nsid", "controller"], namespace=namespace, registry=registry, + ["device", "nsid", "controller"], namespace=namespace, registry=registry, ), "physical_size": Gauge( "physical_size_bytes", From 6df15e5f921a929d5c0ab70252ccc4ec5f037475 Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Thu, 21 Nov 2024 18:08:02 +0100 Subject: [PATCH 6/6] nvme_metrics: adjust wording of comments Signed-off-by: Daniel Swarbrick --- nvme_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nvme_metrics.py b/nvme_metrics.py index 10b3053..fee5796 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -24,6 +24,7 @@ metrics = { # fmt: off + # Host-specific metrics "nvmecli": Info( "nvmecli", "nvme-cli tool information", @@ -196,8 +197,7 @@ def main(): metrics["physical_size"].labels(ns_dev).set(ns["PhysicalSize"]) metrics["used_bytes"].labels(ns_dev).set(ns["UsedBytes"]) - # Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace. - # Fetch the device global SMART log by omitting any --namespace-id flag. + # Fetch the controller global SMART log by omitting the --namespace-id flag. smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"])) # Various counters in the NVMe specification are 128-bit, which would have to