From 38a62bfe6c80dbaa040d64cfbc8a6412de172b0a Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Thu, 21 Nov 2024 07:24:43 +0100 Subject: [PATCH] Fetch device global SMART log once per controller Change "device" label in controller-specific metrics to "controller". This also means that the label value will be the NVMe character device name, e.g. "nvme0", instead of the previously used namespace block device name, e.g. "nvme0n1". Separate metric declaration dict into controller-specific and namespace-specific groups for easier maintenance. Signed-off-by: Daniel Swarbrick --- nvme_metrics.py | 147 ++++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 74 deletions(-) diff --git a/nvme_metrics.py b/nvme_metrics.py index f148750..08e3e6f 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -24,15 +24,22 @@ metrics = { # fmt: off + "nvmecli": Info( + "nvmecli", + "nvme-cli tool information", + ["version"], namespace=namespace, registry=registry, + ), + + # Controller-specific (e.g. "nvme0") metrics "avail_spare": Gauge( "available_spare_ratio", "Device available spare ratio", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "controller_busy_time": Counter( "controller_busy_time_seconds", "Device controller busy time in seconds", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "controller_info": Info( "controller", @@ -43,81 +50,78 @@ "critical_warning": Gauge( "critical_warning", "Device critical warning bitmap field", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "data_units_read": Counter( "data_units_read_total", "Number of 512-byte data units read by host, reported in thousands", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "data_units_written": Counter( "data_units_written_total", "Number of 512-byte data units written by host, reported in thousands", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "host_read_commands": Counter( "host_read_commands_total", "Device read commands from host", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "host_write_commands": Counter( "host_write_commands_total", "Device write commands from host", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "media_errors": Counter( "media_errors_total", "Device media errors total", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "num_err_log_entries": Counter( "num_err_log_entries_total", "Device error log entry count", - ["device"], namespace=namespace, registry=registry, - ), - "nvmecli": Info( - "nvmecli", - "nvme-cli tool information", - ["version"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "percent_used": Gauge( "percentage_used_ratio", "Device percentage used ratio", - ["device"], namespace=namespace, registry=registry, - ), - "physical_size": Gauge( - "physical_size_bytes", - "Device size in bytes", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "power_cycles": Counter( "power_cycles_total", "Device number of power cycles", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "power_on_hours": Counter( "power_on_hours_total", "Device power-on hours", - ["device"], namespace=namespace, registry=registry, - ), - "sector_size": Gauge( - "sector_size_bytes", - "Device sector size in bytes", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "spare_thresh": Gauge( "available_spare_threshold_ratio", "Device available spare threshold ratio", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "temperature": Gauge( "temperature_celsius", "Device temperature in degrees Celsius", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "unsafe_shutdowns": Counter( "unsafe_shutdowns_total", "Device number of unsafe shutdowns", + ["controller"], namespace=namespace, registry=registry, + ), + + # Namespace-specific (e.g. "nvme0n1") metrics + "physical_size": Gauge( + "physical_size_bytes", + "Device size in bytes", + ["device"], namespace=namespace, registry=registry, + ), + "sector_size": Gauge( + "sector_size_bytes", + "Device sector size in bytes", ["device"], namespace=namespace, registry=registry, ), "used_bytes": Gauge( @@ -164,8 +168,10 @@ def main(): for device in device_list["Devices"]: for subsys in device["Subsystems"]: for ctrl in subsys["Controllers"]: + ctrl_dev = ctrl["Controller"] + metrics["controller_info"].labels( - ctrl["Controller"], + ctrl_dev, ctrl["ModelNumber"], ctrl["Firmware"], ctrl["SerialNumber"].strip(), @@ -179,50 +185,43 @@ def main(): metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"]) metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"]) - # FIXME: The smart-log should only need to be fetched once per controller, not - # per namespace. However, in order to preserve legacy metric labels, fetch it - # per namespace anyway. Most consumer grade SSDs will only have one namespace. - smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name)) - - # Various counters in the NVMe specification are 128-bit, which would have to - # discard resolution if converted to a JSON number (i.e., float64_t). Instead, - # nvme-cli marshals them as strings. As such, they need to be explicitly cast - # to int or float when using them in Counter metrics. - metrics["data_units_read"].labels(device_name).inc( - int(smart_log["data_units_read"]) - ) - metrics["data_units_written"].labels(device_name).inc( - int(smart_log["data_units_written"]) - ) - metrics["host_read_commands"].labels(device_name).inc( - int(smart_log["host_read_commands"]) - ) - metrics["host_write_commands"].labels(device_name).inc( - int(smart_log["host_write_commands"]) - ) - metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100) - metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100) - metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100) - metrics["critical_warning"].labels(device_name).set( - smart_log["critical_warning"]["value"] - ) - metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"])) - metrics["num_err_log_entries"].labels(device_name).inc( - int(smart_log["num_err_log_entries"]) - ) - metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"])) - metrics["power_on_hours"].labels(device_name).inc( - int(smart_log["power_on_hours"]) - ) - metrics["controller_busy_time"].labels(device_name).inc( - int(smart_log["controller_busy_time"]) - ) - metrics["unsafe_shutdowns"].labels(device_name).inc( - int(smart_log["unsafe_shutdowns"]) - ) - - # NVMe reports temperature in kelvins; convert it to degrees Celsius. - metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273) + # Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace. + # Fetch the device global SMART log by omitting any --namespace-id flag. + smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"])) + + # Various counters in the NVMe specification are 128-bit, which would have to + # discard resolution if converted to a JSON number (i.e., float64_t). Instead, + # nvme-cli marshals them as strings. As such, they need to be explicitly cast to int + # or float when using them in Counter metrics. + metrics["data_units_read"].labels(ctrl_dev).inc(int(smart_log["data_units_read"])) + metrics["data_units_written"].labels(ctrl_dev).inc( + int(smart_log["data_units_written"]) + ) + metrics["host_read_commands"].labels(ctrl_dev).inc( + int(smart_log["host_read_commands"]) + ) + metrics["host_write_commands"].labels(ctrl_dev).inc( + int(smart_log["host_write_commands"]) + ) + metrics["avail_spare"].labels(ctrl_dev).set(smart_log["avail_spare"] / 100) + metrics["spare_thresh"].labels(ctrl_dev).set(smart_log["spare_thresh"] / 100) + metrics["percent_used"].labels(ctrl_dev).set(smart_log["percent_used"] / 100) + metrics["critical_warning"].labels(ctrl_dev).set( + smart_log["critical_warning"]["value"] + ) + metrics["media_errors"].labels(ctrl_dev).inc(int(smart_log["media_errors"])) + metrics["num_err_log_entries"].labels(ctrl_dev).inc( + int(smart_log["num_err_log_entries"]) + ) + metrics["power_cycles"].labels(ctrl_dev).inc(int(smart_log["power_cycles"])) + metrics["power_on_hours"].labels(ctrl_dev).inc(int(smart_log["power_on_hours"])) + metrics["controller_busy_time"].labels(ctrl_dev).inc( + int(smart_log["controller_busy_time"]) + ) + metrics["unsafe_shutdowns"].labels(ctrl_dev).inc(int(smart_log["unsafe_shutdowns"])) + + # NVMe reports temperature in kelvins; convert it to degrees Celsius. + metrics["temperature"].labels(ctrl_dev).set(smart_log["temperature"] - 273) if __name__ == "__main__":