diff --git a/nvme_metrics.py b/nvme_metrics.py index f148750..08e3e6f 100755 --- a/nvme_metrics.py +++ b/nvme_metrics.py @@ -24,15 +24,22 @@ metrics = { # fmt: off + "nvmecli": Info( + "nvmecli", + "nvme-cli tool information", + ["version"], namespace=namespace, registry=registry, + ), + + # Controller-specific (e.g. "nvme0") metrics "avail_spare": Gauge( "available_spare_ratio", "Device available spare ratio", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "controller_busy_time": Counter( "controller_busy_time_seconds", "Device controller busy time in seconds", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "controller_info": Info( "controller", @@ -43,81 +50,78 @@ "critical_warning": Gauge( "critical_warning", "Device critical warning bitmap field", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "data_units_read": Counter( "data_units_read_total", "Number of 512-byte data units read by host, reported in thousands", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "data_units_written": Counter( "data_units_written_total", "Number of 512-byte data units written by host, reported in thousands", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "host_read_commands": Counter( "host_read_commands_total", "Device read commands from host", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "host_write_commands": Counter( "host_write_commands_total", "Device write commands from host", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "media_errors": Counter( "media_errors_total", "Device media errors total", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "num_err_log_entries": Counter( "num_err_log_entries_total", "Device error log entry count", - ["device"], namespace=namespace, registry=registry, - ), - "nvmecli": Info( - "nvmecli", - "nvme-cli tool information", - ["version"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "percent_used": Gauge( "percentage_used_ratio", "Device percentage used ratio", - ["device"], namespace=namespace, registry=registry, - ), - "physical_size": Gauge( - "physical_size_bytes", - "Device size in bytes", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "power_cycles": Counter( "power_cycles_total", "Device number of power cycles", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "power_on_hours": Counter( "power_on_hours_total", "Device power-on hours", - ["device"], namespace=namespace, registry=registry, - ), - "sector_size": Gauge( - "sector_size_bytes", - "Device sector size in bytes", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "spare_thresh": Gauge( "available_spare_threshold_ratio", "Device available spare threshold ratio", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "temperature": Gauge( "temperature_celsius", "Device temperature in degrees Celsius", - ["device"], namespace=namespace, registry=registry, + ["controller"], namespace=namespace, registry=registry, ), "unsafe_shutdowns": Counter( "unsafe_shutdowns_total", "Device number of unsafe shutdowns", + ["controller"], namespace=namespace, registry=registry, + ), + + # Namespace-specific (e.g. "nvme0n1") metrics + "physical_size": Gauge( + "physical_size_bytes", + "Device size in bytes", + ["device"], namespace=namespace, registry=registry, + ), + "sector_size": Gauge( + "sector_size_bytes", + "Device sector size in bytes", ["device"], namespace=namespace, registry=registry, ), "used_bytes": Gauge( @@ -164,8 +168,10 @@ def main(): for device in device_list["Devices"]: for subsys in device["Subsystems"]: for ctrl in subsys["Controllers"]: + ctrl_dev = ctrl["Controller"] + metrics["controller_info"].labels( - ctrl["Controller"], + ctrl_dev, ctrl["ModelNumber"], ctrl["Firmware"], ctrl["SerialNumber"].strip(), @@ -179,50 +185,43 @@ def main(): metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"]) metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"]) - # FIXME: The smart-log should only need to be fetched once per controller, not - # per namespace. However, in order to preserve legacy metric labels, fetch it - # per namespace anyway. Most consumer grade SSDs will only have one namespace. - smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name)) - - # Various counters in the NVMe specification are 128-bit, which would have to - # discard resolution if converted to a JSON number (i.e., float64_t). Instead, - # nvme-cli marshals them as strings. As such, they need to be explicitly cast - # to int or float when using them in Counter metrics. - metrics["data_units_read"].labels(device_name).inc( - int(smart_log["data_units_read"]) - ) - metrics["data_units_written"].labels(device_name).inc( - int(smart_log["data_units_written"]) - ) - metrics["host_read_commands"].labels(device_name).inc( - int(smart_log["host_read_commands"]) - ) - metrics["host_write_commands"].labels(device_name).inc( - int(smart_log["host_write_commands"]) - ) - metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100) - metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100) - metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100) - metrics["critical_warning"].labels(device_name).set( - smart_log["critical_warning"]["value"] - ) - metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"])) - metrics["num_err_log_entries"].labels(device_name).inc( - int(smart_log["num_err_log_entries"]) - ) - metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"])) - metrics["power_on_hours"].labels(device_name).inc( - int(smart_log["power_on_hours"]) - ) - metrics["controller_busy_time"].labels(device_name).inc( - int(smart_log["controller_busy_time"]) - ) - metrics["unsafe_shutdowns"].labels(device_name).inc( - int(smart_log["unsafe_shutdowns"]) - ) - - # NVMe reports temperature in kelvins; convert it to degrees Celsius. - metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273) + # Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace. + # Fetch the device global SMART log by omitting any --namespace-id flag. + smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"])) + + # Various counters in the NVMe specification are 128-bit, which would have to + # discard resolution if converted to a JSON number (i.e., float64_t). Instead, + # nvme-cli marshals them as strings. As such, they need to be explicitly cast to int + # or float when using them in Counter metrics. + metrics["data_units_read"].labels(ctrl_dev).inc(int(smart_log["data_units_read"])) + metrics["data_units_written"].labels(ctrl_dev).inc( + int(smart_log["data_units_written"]) + ) + metrics["host_read_commands"].labels(ctrl_dev).inc( + int(smart_log["host_read_commands"]) + ) + metrics["host_write_commands"].labels(ctrl_dev).inc( + int(smart_log["host_write_commands"]) + ) + metrics["avail_spare"].labels(ctrl_dev).set(smart_log["avail_spare"] / 100) + metrics["spare_thresh"].labels(ctrl_dev).set(smart_log["spare_thresh"] / 100) + metrics["percent_used"].labels(ctrl_dev).set(smart_log["percent_used"] / 100) + metrics["critical_warning"].labels(ctrl_dev).set( + smart_log["critical_warning"]["value"] + ) + metrics["media_errors"].labels(ctrl_dev).inc(int(smart_log["media_errors"])) + metrics["num_err_log_entries"].labels(ctrl_dev).inc( + int(smart_log["num_err_log_entries"]) + ) + metrics["power_cycles"].labels(ctrl_dev).inc(int(smart_log["power_cycles"])) + metrics["power_on_hours"].labels(ctrl_dev).inc(int(smart_log["power_on_hours"])) + metrics["controller_busy_time"].labels(ctrl_dev).inc( + int(smart_log["controller_busy_time"]) + ) + metrics["unsafe_shutdowns"].labels(ctrl_dev).inc(int(smart_log["unsafe_shutdowns"])) + + # NVMe reports temperature in kelvins; convert it to degrees Celsius. + metrics["temperature"].labels(ctrl_dev).set(smart_log["temperature"] - 273) if __name__ == "__main__":