Skip to content

Commit

Permalink
Fetch device global SMART log once per controller
Browse files Browse the repository at this point in the history
Change "device" label in controller-specific metrics to "controller".
This also means that the label value will be the NVMe character device
name, e.g. "nvme0", instead of the previously used namespace block
device name, e.g. "nvme0n1".

Separate metric declaration dict into controller-specific and
namespace-specific groups for easier maintenance.

Signed-off-by: Daniel Swarbrick <[email protected]>
  • Loading branch information
dswarbrick committed Nov 21, 2024
1 parent 4e2c7e1 commit 38a62bf
Showing 1 changed file with 73 additions and 74 deletions.
147 changes: 73 additions & 74 deletions nvme_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,22 @@

metrics = {
# fmt: off
"nvmecli": Info(
"nvmecli",
"nvme-cli tool information",
["version"], namespace=namespace, registry=registry,
),

# Controller-specific (e.g. "nvme0") metrics
"avail_spare": Gauge(
"available_spare_ratio",
"Device available spare ratio",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"controller_busy_time": Counter(
"controller_busy_time_seconds",
"Device controller busy time in seconds",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"controller_info": Info(
"controller",
Expand All @@ -43,81 +50,78 @@
"critical_warning": Gauge(
"critical_warning",
"Device critical warning bitmap field",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"data_units_read": Counter(
"data_units_read_total",
"Number of 512-byte data units read by host, reported in thousands",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"data_units_written": Counter(
"data_units_written_total",
"Number of 512-byte data units written by host, reported in thousands",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"host_read_commands": Counter(
"host_read_commands_total",
"Device read commands from host",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"host_write_commands": Counter(
"host_write_commands_total",
"Device write commands from host",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"media_errors": Counter(
"media_errors_total",
"Device media errors total",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"num_err_log_entries": Counter(
"num_err_log_entries_total",
"Device error log entry count",
["device"], namespace=namespace, registry=registry,
),
"nvmecli": Info(
"nvmecli",
"nvme-cli tool information",
["version"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"percent_used": Gauge(
"percentage_used_ratio",
"Device percentage used ratio",
["device"], namespace=namespace, registry=registry,
),
"physical_size": Gauge(
"physical_size_bytes",
"Device size in bytes",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"power_cycles": Counter(
"power_cycles_total",
"Device number of power cycles",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"power_on_hours": Counter(
"power_on_hours_total",
"Device power-on hours",
["device"], namespace=namespace, registry=registry,
),
"sector_size": Gauge(
"sector_size_bytes",
"Device sector size in bytes",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"spare_thresh": Gauge(
"available_spare_threshold_ratio",
"Device available spare threshold ratio",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"temperature": Gauge(
"temperature_celsius",
"Device temperature in degrees Celsius",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"unsafe_shutdowns": Counter(
"unsafe_shutdowns_total",
"Device number of unsafe shutdowns",
["controller"], namespace=namespace, registry=registry,
),

# Namespace-specific (e.g. "nvme0n1") metrics
"physical_size": Gauge(
"physical_size_bytes",
"Device size in bytes",
["device"], namespace=namespace, registry=registry,
),
"sector_size": Gauge(
"sector_size_bytes",
"Device sector size in bytes",
["device"], namespace=namespace, registry=registry,
),
"used_bytes": Gauge(
Expand Down Expand Up @@ -164,8 +168,10 @@ def main():
for device in device_list["Devices"]:
for subsys in device["Subsystems"]:
for ctrl in subsys["Controllers"]:
ctrl_dev = ctrl["Controller"]

metrics["controller_info"].labels(
ctrl["Controller"],
ctrl_dev,
ctrl["ModelNumber"],
ctrl["Firmware"],
ctrl["SerialNumber"].strip(),
Expand All @@ -179,50 +185,43 @@ def main():
metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"])
metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"])

# FIXME: The smart-log should only need to be fetched once per controller, not
# per namespace. However, in order to preserve legacy metric labels, fetch it
# per namespace anyway. Most consumer grade SSDs will only have one namespace.
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name))

# Various counters in the NVMe specification are 128-bit, which would have to
# discard resolution if converted to a JSON number (i.e., float64_t). Instead,
# nvme-cli marshals them as strings. As such, they need to be explicitly cast
# to int or float when using them in Counter metrics.
metrics["data_units_read"].labels(device_name).inc(
int(smart_log["data_units_read"])
)
metrics["data_units_written"].labels(device_name).inc(
int(smart_log["data_units_written"])
)
metrics["host_read_commands"].labels(device_name).inc(
int(smart_log["host_read_commands"])
)
metrics["host_write_commands"].labels(device_name).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(device_name).set(
smart_log["critical_warning"]["value"]
)
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(device_name).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(device_name).inc(
int(smart_log["power_on_hours"])
)
metrics["controller_busy_time"].labels(device_name).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(device_name).inc(
int(smart_log["unsafe_shutdowns"])
)

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)
# Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace.
# Fetch the device global SMART log by omitting any --namespace-id flag.
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"]))

# Various counters in the NVMe specification are 128-bit, which would have to
# discard resolution if converted to a JSON number (i.e., float64_t). Instead,
# nvme-cli marshals them as strings. As such, they need to be explicitly cast to int
# or float when using them in Counter metrics.
metrics["data_units_read"].labels(ctrl_dev).inc(int(smart_log["data_units_read"]))
metrics["data_units_written"].labels(ctrl_dev).inc(
int(smart_log["data_units_written"])
)
metrics["host_read_commands"].labels(ctrl_dev).inc(
int(smart_log["host_read_commands"])
)
metrics["host_write_commands"].labels(ctrl_dev).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(ctrl_dev).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(ctrl_dev).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(ctrl_dev).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(ctrl_dev).set(
smart_log["critical_warning"]["value"]
)
metrics["media_errors"].labels(ctrl_dev).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(ctrl_dev).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(ctrl_dev).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(ctrl_dev).inc(int(smart_log["power_on_hours"]))
metrics["controller_busy_time"].labels(ctrl_dev).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(ctrl_dev).inc(int(smart_log["unsafe_shutdowns"]))

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(ctrl_dev).set(smart_log["temperature"] - 273)


if __name__ == "__main__":
Expand Down

0 comments on commit 38a62bf

Please sign in to comment.