Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nvme_metrics: refactor metrics to better fit the verbose / nested JSON output of nvme-cli #228

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
197 changes: 101 additions & 96 deletions nvme_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,102 +24,109 @@

metrics = {
# fmt: off
"nvmecli": Info(
dswarbrick marked this conversation as resolved.
Show resolved Hide resolved
"nvmecli",
"nvme-cli tool information",
["version"], namespace=namespace, registry=registry,
),

# Controller-specific (e.g. "nvme0") metrics
"avail_spare": Gauge(
"available_spare_ratio",
"Device available spare ratio",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"controller_busy_time": Counter(
"controller_busy_time_seconds",
"Device controller busy time in seconds",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"controller_info": Info(
"controller",
"Controller information",
["controller", "model", "firmware", "serial", "transport"], namespace=namespace,
registry=registry,
),
"critical_warning": Gauge(
"critical_warning",
"Device critical warning bitmap field",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"data_units_read": Counter(
"data_units_read_total",
"Number of 512-byte data units read by host, reported in thousands",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"data_units_written": Counter(
"data_units_written_total",
"Number of 512-byte data units written by host, reported in thousands",
["device"], namespace=namespace, registry=registry,
),
"device_info": Info(
"device",
"Device information",
["device", "model", "firmware", "serial"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"host_read_commands": Counter(
"host_read_commands_total",
"Device read commands from host",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"host_write_commands": Counter(
"host_write_commands_total",
"Device write commands from host",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"media_errors": Counter(
"media_errors_total",
"Device media errors total",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"num_err_log_entries": Counter(
"num_err_log_entries_total",
"Device error log entry count",
["device"], namespace=namespace, registry=registry,
),
# FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change
# will result in the metric having a "_info" suffix automatically appended, which is arguably
# a breaking change.
"nvmecli": Gauge(
"nvmecli",
"nvme-cli tool information",
["version"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"percent_used": Gauge(
"percentage_used_ratio",
"Device percentage used ratio",
["device"], namespace=namespace, registry=registry,
),
"physical_size": Gauge(
"physical_size_bytes",
"Device size in bytes",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"power_cycles": Counter(
"power_cycles_total",
"Device number of power cycles",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"power_on_hours": Counter(
"power_on_hours_total",
"Device power-on hours",
["device"], namespace=namespace, registry=registry,
),
"sector_size": Gauge(
"sector_size_bytes",
"Device sector size in bytes",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"spare_thresh": Gauge(
"available_spare_threshold_ratio",
"Device available spare threshold ratio",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"temperature": Gauge(
"temperature_celsius",
"Device temperature in degrees Celsius",
["device"], namespace=namespace, registry=registry,
["controller"], namespace=namespace, registry=registry,
),
"unsafe_shutdowns": Counter(
"unsafe_shutdowns_total",
"Device number of unsafe shutdowns",
["controller"], namespace=namespace, registry=registry,
),

# Namespace-specific (e.g. "nvme0n1") metrics
"namespace_info": Info(
"namespace",
"Namespace information",
["device", "nsid", "controller"], namespace=namespace, registry=registry,
),
"physical_size": Gauge(
"physical_size_bytes",
"Device size in bytes",
["device"], namespace=namespace, registry=registry,
),
"sector_size": Gauge(
"sector_size_bytes",
"Device sector size in bytes",
["device"], namespace=namespace, registry=registry,
),
"used_bytes": Gauge(
Expand Down Expand Up @@ -159,75 +166,73 @@ def main():
cli_version = match.group(1)
else:
cli_version = "unknown"
metrics["nvmecli"].labels(cli_version).set(1)
metrics["nvmecli"].labels(cli_version)

device_list = exec_nvme_json("list")

for device in device_list["Devices"]:
for subsys in device["Subsystems"]:
for ctrl in subsys["Controllers"]:
ctrl_dev = ctrl["Controller"]

metrics["controller_info"].labels(
ctrl_dev,
ctrl["ModelNumber"],
ctrl["Firmware"],
ctrl["SerialNumber"].strip(),
ctrl["Transport"],
)

for ns in ctrl["Namespaces"]:
device_name = ns["NameSpace"]

# FIXME: This metric ought to be refactored into a "controller_info" metric,
# since it contains information that is not unique to the namespace. However,
# previous versions of this collector erroneously referred to namespaces, e.g.
# "nvme0n1", as devices, so preserve the former behaviour for now.
metrics["device_info"].labels(
device_name,
ctrl["ModelNumber"],
ctrl["Firmware"],
ctrl["SerialNumber"].strip(),
)
ns_dev = ns["NameSpace"]

metrics["sector_size"].labels(device_name).set(ns["SectorSize"])
metrics["physical_size"].labels(device_name).set(ns["PhysicalSize"])
metrics["used_bytes"].labels(device_name).set(ns["UsedBytes"])

# FIXME: The smart-log should only need to be fetched once per controller, not
# per namespace. However, in order to preserve legacy metric labels, fetch it
# per namespace anyway. Most consumer grade SSDs will only have one namespace.
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", device_name))

# Various counters in the NVMe specification are 128-bit, which would have to
# discard resolution if converted to a JSON number (i.e., float64_t). Instead,
# nvme-cli marshals them as strings. As such, they need to be explicitly cast
# to int or float when using them in Counter metrics.
metrics["data_units_read"].labels(device_name).inc(
int(smart_log["data_units_read"])
)
metrics["data_units_written"].labels(device_name).inc(
int(smart_log["data_units_written"])
)
metrics["host_read_commands"].labels(device_name).inc(
int(smart_log["host_read_commands"])
)
metrics["host_write_commands"].labels(device_name).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(device_name).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(device_name).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(device_name).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(device_name).set(
smart_log["critical_warning"]["value"]
)
metrics["media_errors"].labels(device_name).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(device_name).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(device_name).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(device_name).inc(
int(smart_log["power_on_hours"])
)
metrics["controller_busy_time"].labels(device_name).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(device_name).inc(
int(smart_log["unsafe_shutdowns"])
metrics["namespace_info"].labels(
ns_dev,
ns["NSID"],
ctrl_dev,
)

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(device_name).set(smart_log["temperature"] - 273)
metrics["sector_size"].labels(ns_dev).set(ns["SectorSize"])
metrics["physical_size"].labels(ns_dev).set(ns["PhysicalSize"])
metrics["used_bytes"].labels(ns_dev).set(ns["UsedBytes"])

# Most SSDs (perhaps _all_ consumer grade SSDs) only contain a single namespace.
dswarbrick marked this conversation as resolved.
Show resolved Hide resolved
# Fetch the device-global SMART log by omitting any --namespace-id flag.
dswarbrick marked this conversation as resolved.
Show resolved Hide resolved
smart_log = exec_nvme_json("smart-log", os.path.join("/dev", ctrl["Controller"]))

# Various counters in the NVMe specification are 128-bit and would lose
# precision if converted to a JSON number (i.e., a 64-bit float). Instead,
# nvme-cli marshals them as strings. As such, they need to be explicitly cast to int
# or float when using them in Counter metrics.
metrics["data_units_read"].labels(ctrl_dev).inc(int(smart_log["data_units_read"]))
metrics["data_units_written"].labels(ctrl_dev).inc(
int(smart_log["data_units_written"])
)
metrics["host_read_commands"].labels(ctrl_dev).inc(
int(smart_log["host_read_commands"])
)
metrics["host_write_commands"].labels(ctrl_dev).inc(
int(smart_log["host_write_commands"])
)
metrics["avail_spare"].labels(ctrl_dev).set(smart_log["avail_spare"] / 100)
metrics["spare_thresh"].labels(ctrl_dev).set(smart_log["spare_thresh"] / 100)
metrics["percent_used"].labels(ctrl_dev).set(smart_log["percent_used"] / 100)
metrics["critical_warning"].labels(ctrl_dev).set(
smart_log["critical_warning"]["value"]
)
metrics["media_errors"].labels(ctrl_dev).inc(int(smart_log["media_errors"]))
metrics["num_err_log_entries"].labels(ctrl_dev).inc(
int(smart_log["num_err_log_entries"])
)
metrics["power_cycles"].labels(ctrl_dev).inc(int(smart_log["power_cycles"]))
metrics["power_on_hours"].labels(ctrl_dev).inc(int(smart_log["power_on_hours"]))
metrics["controller_busy_time"].labels(ctrl_dev).inc(
int(smart_log["controller_busy_time"])
)
metrics["unsafe_shutdowns"].labels(ctrl_dev).inc(int(smart_log["unsafe_shutdowns"]))

# NVMe reports temperature in kelvins; convert it to degrees Celsius.
metrics["temperature"].labels(ctrl_dev).set(smart_log["temperature"] - 273)


if __name__ == "__main__":
Expand Down