From e6c8b2149ed637bfced5af092192a0c218e97f16 Mon Sep 17 00:00:00 2001 From: dobbi84 Date: Tue, 6 Feb 2024 08:46:57 +0100 Subject: [PATCH] NVMe metrics collector based on parallel The collector based on parallel allows to collect the metrics quicker when the node has several NVMe drives. Included a Grafana dashboard that visualizes the metrics and for each panel reports a description coming from the NVMe express specifications. Because there are now four files NVMe related everything has been moved to a common directory. Signed-off-by: dobbi84 --- nvme_metrics.py => nvme/nvme_metrics.py | 0 nvme_metrics.sh => nvme/nvme_metrics.sh | 0 nvme/nvme_metrics_dashboard.json | 3047 +++++++++++++++++++++++ nvme/nvme_metrics_parallel.sh | 116 + 4 files changed, 3163 insertions(+) rename nvme_metrics.py => nvme/nvme_metrics.py (100%) rename nvme_metrics.sh => nvme/nvme_metrics.sh (100%) create mode 100644 nvme/nvme_metrics_dashboard.json create mode 100644 nvme/nvme_metrics_parallel.sh diff --git a/nvme_metrics.py b/nvme/nvme_metrics.py similarity index 100% rename from nvme_metrics.py rename to nvme/nvme_metrics.py diff --git a/nvme_metrics.sh b/nvme/nvme_metrics.sh similarity index 100% rename from nvme_metrics.sh rename to nvme/nvme_metrics.sh diff --git a/nvme/nvme_metrics_dashboard.json b/nvme/nvme_metrics_dashboard.json new file mode 100644 index 0000000..7878f36 --- /dev/null +++ b/nvme/nvme_metrics_dashboard.json @@ -0,0 +1,3047 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.2.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Detailed overview of the NVME cli smartlog output", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 12736, + "graphTooltip": 0, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "NVME Express", + "tooltip": "", + "type": "link", + "url": "https://nvmexpress.org/" + }, + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "E4" + ], + "targetBlank": true, + "title": "Related Dashboards", + "tooltip": "", + "type": "dashboards", + "url": "" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 59, + "panels": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "description": "", + "gridPos": { + "h": 18, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 57, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "# NVME Metrics\n\nThis dashboard visualizes the metrics exported by a custom exporter inspired by the [Prometheus Node Exporter \"nvme\" collector and \"text file collector\"](https://github.com/prometheus/node_exporter#textfile-collector), uses a bash script running on scraped instances which parses the results of the command `nvme smart-log` for each NVME device discovered.\n\nFor metrics explanation refer to the [NVME Express specifications](https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-2.0b-2021.12.18-Ratified.pdf), figure \"207\" at Page 180.\n\n## How to use this dashboard\n\n### Help Information\n\nEach visualization displays a \"information\" link on the top left which explains what is being displayed, if any thresholds are applied and how they are implemented.\n\n### Variables\n\nOn the top of the page it is possible to select \"templated\" variables that helps in scoping the results of the visualizations.\n\nSome of these variables are templated as well by other variables.\n\n - **datasource**: The Prometheus Datasource to gather the metrics from.\n - **cluster_name**: The name of the cluster to which the node belongs, this is usually a static label applied to the scraper job.\n - **node**: The node where the metrics are collected from.\n - **device**: The NVME devices discovered on the given host.\n\n Some variables allow multiple selections and \"All\" option.\n\n ### Repetitions\n\n Selecting multiple variables values will result in visualizations expansion by the number of values:\n\n - **node**: The hosts will expand the graphs inside the rows in the same logic.\n - **device**: The device will expand inside the same graph.", + "mode": "markdown" + }, + "pluginVersion": "10.2.3", + "title": "NVME Disks Dashboard", + "transparent": true, + "type": "text" + } + ], + "title": "Dashboard Help", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 15, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "This field indicates critical warnings for the state of the controller. Each bit corresponds to a critical warning type; multiple bits may be set to ‘1’. If a bit is cleared to ‘0’, then that critical warning does not apply. Critical warnings may result in an asynchronous event notification to the host. Bits in this field represent the current associated state and are not persistent.\nNOTE: The metric representation is in Decimal and we do not account for all 32 combinations in the thresholds, they are simbly set based on the MSB\n\n- No Bit set, metric val 0, color green: No Critical Warnings\n- Bit 0, metric val 1, color turquoise: Available Spare is below Threshold\n- Bit 1, metric val 2, color yellow: Temperature has exceeded Threshold\n- Bit 2, metric val 4, color orange: Reliability is degraded due to excessive media or internal errors\n- Bit 3, metric val 8, color purple: Media is placed in Read- Only Mode\n- Bit 4, metric val 16, color red: Volatile Memory Backup System has failed (e.g., enhanced power loss capacitor test failure) ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#185265" + }, + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 10 + }, + { + "color": "red", + "value": 20 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 9 + }, + "id": 10, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_critical_warning_total{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Critical Warnings Total", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "NVME SMART metric critical_warning_total", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 13, + "x": 11, + "y": 9 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate (nvme_critical_warning_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "NVME Critical Warnings Total Over Time", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the number of occurrences where the controller detected an unrecovered data integrity error. Errors such as uncorrectable ECC, CRC checksum failure, or LBA tag mismatch are included in this field. Errors introduced as a result of a Write Uncorrectable command (refer to the NVM Command Set specification) may or may not be included in this field.\nSince this metric accumulates over time we do not set \"real\" thresholds and the counter becomes violet when above 0.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d36e36" + }, + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 11, + "maxPerRow": 3, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_media_errors_total{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Media and Data Integrity Errors", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the number of Error Information log entries over the life of the controller.\nSince this metric accumulates over time we do not set \"real\" thresholds and the counter becomes violet when above 0.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#7473af" + }, + { + "color": "green", + "value": 0 + }, + { + "color": "#7473af", + "value": 100 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 18, + "maxPerRow": 3, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_num_err_log_entries_total{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Number of Error Information Log Entries", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the overtime view of the Media and Data Integrity errors metric, no rate is calculated.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 1 + }, + { + "color": "red", + "value": 3 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(nvme_media_errors_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Media and Data Integrity Errors Over Time", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the overtime view of the Number of Error Information Log Entries metric, no rate is calculated.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "#EAB839", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "nvme0n1" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(nvme_num_err_log_entries_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Number of Error Information Log Entries Over Time", + "transparent": true, + "type": "timeseries" + } + ], + "repeat": "node", + "repeatDirection": "h", + "title": "Node $node Alerts and Errors", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 17, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Reports the version of the `nvme-cli` installed on the scraped instances.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#889e17", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 10 + }, + "id": 2, + "maxPerRow": 4, + "options": { + "colorMode": "background_solid", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_nvmecli{instance=~\"[[node]]\"}", + "format": "time_series", + "instant": true, + "legendFormat": "{{version}}", + "range": false, + "rawQuery": true, + "refId": "A" + } + ], + "title": "NVME CLI Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "NVME metrics are collected via \"text file Node exporter\", this metric reports the time the metrics have been collected by the local cronjob.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#185265", + "value": null + } + ] + }, + "unit": "dateTimeAsIsoNoDateIfToday" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 3, + "y": 10 + }, + "id": 7, + "maxPerRow": 4, + "options": { + "colorMode": "background_solid", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_metrics_completion_time{instance=~\"[[node]]\"} * 1000", + "format": "table", + "instant": true, + "legendFormat": "", + "range": false, + "refId": "A" + } + ], + "title": "NVME metrics collection time", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "NVME metrics are collected via \"text file Node exporter\", this metric reports how long ago the letrics have been collected by the local cronjob.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d36e36", + "value": null + } + ] + }, + "unit": "dateTimeFromNow" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 5, + "x": 7, + "y": 10 + }, + "id": 6, + "maxPerRow": 4, + "options": { + "colorMode": "background_solid", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_metrics_completion_time{instance=~\"[[node]]\"} * 1000", + "format": "table", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "Elapsed NVME metrics collection time", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "mode": "basic", + "type": "color-background" + }, + "inspect": false + }, + "mappings": [ + { + "options": { + "live": { + "color": "green", + "index": 0 + } + }, + "type": "value" + }, + { + "options": { + "pattern": "!live", + "result": { + "color": "red", + "index": 1 + } + }, + "type": "regex" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#18648c", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 10 + }, + "hideTimeOverride": true, + "id": 69, + "links": [], + "maxDataPoints": 100, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": true, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "node_nvme_info{instance=~\"$node\", device=~\"(^nvme[0-9]+$|^sd[a-z]+$)\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Disk Info", + "transformations": [ + { + "id": "groupBy", + "options": { + "fields": { + "device": { + "aggregations": [], + "operation": "groupby" + }, + "firmware_revision": { + "aggregations": [], + "operation": "groupby" + }, + "model": { + "aggregations": [], + "operation": "groupby" + }, + "serial": { + "aggregations": [], + "operation": "groupby" + }, + "state": { + "aggregations": [], + "operation": "groupby" + } + } + } + } + ], + "transparent": true, + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "NVME metrics are collected via \"text file Node exporter\", this metric reports how long ago the letrics have been collected by the local cronjob.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#7473af", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 71, + "maxPerRow": 4, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_unsafe_shutdowns_total{instance=~\"[[node]]\", device=~\"[[device]]\"}", + "format": "time_series", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Unsafe Shutdowns", + "transparent": true, + "type": "stat" + } + ], + "repeat": "node", + "repeatDirection": "h", + "title": "Node $node Info", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 52, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains a vendor specific estimate of the percentage of NVM subsystem life used based on the actual usage and the manufacturer’s prediction of NVM life. A value of 100 indicates that the estimated endurance of the NVM in the NVM subsystem has been consumed, but may not indicate an NVM subsystem failure. The value is allowed to exceed 100. This value shall be updated \nonce per power-on hour (when the controller is not in a sleep state).\n\nThe Gauge will turn:\n- red 100%\n- yellow 80%", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 0.8 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 53, + "options": { + "minVizHeight": 200, + "minVizWidth": 200, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_percentage_used_ratio{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Percentage Used Ratio", + "transparent": true, + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains a normalized percentage (0% to 100%) of the remaining spare capacity available.\nThe gauge thresholds are based on the reported metric \"spare threshold value\" of 10%, therefore:\n- above 20% green\n- between 10% and 20% yellow\n- below 10% red\n-\n-", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "#EAB839", + "value": 0.1 + }, + { + "color": "green", + "value": 0.2 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 54, + "options": { + "minVizHeight": 200, + "minVizWidth": 200, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_available_spare_ratio{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Available Spare Ratio", + "transparent": true, + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#185265" + }, + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 10 + }, + { + "color": "red", + "value": 20 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 70, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_endurance_grp_critical_warning_summary{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Endurance group Critial Warnings", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "When the Available Spare falls below the threshold indicated in this field, an asynchronous event completion may occur. The value is indicated as a normalized percentage (0% to 100%).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "#185265" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 55, + "options": { + "colorMode": "background_solid", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_available_spare_threshold_ratio{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Available Spare Threshold Ratio", + "transparent": true, + "type": "stat" + } + ], + "repeat": "node", + "repeatDirection": "h", + "title": "Node $node Disk Endurance", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 45, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the number of power cycles", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#18648c", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 47, + "maxPerRow": 3, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_power_cycles_total{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Powercycles Total", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": " Contains the number of power-on hours. This may not include time that the controller was powered and in a non-operational power state.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#889e17", + "value": null + } + ] + }, + "unit": "h" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 48, + "maxPerRow": 3, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_power_on_hours_total{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Power ON Time", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains a value corresponding to a temperature in Celsius that represents the current composite temperature of the controller and namespace(s) associated with that controller. Warning and critical overheating composite temperature threshold values are reported by the WCTEMP and CCTEMP fields, these are dependant on the disk model and obtained via `smartctl` command, therefore:\n\n- below 72C green\n- between 72C and 85C yellow\n- above 85C red", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 72 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 49, + "options": { + "minVizHeight": 200, + "minVizWidth": 200, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_temperature_celsius{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Temperature", + "transparent": true, + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Temperature over time, refer to the gauge graph for the help.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 72 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 50, + "maxPerRow": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_temperature_celsius{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Temperature Over Time", + "transparent": true, + "type": "timeseries" + } + ], + "repeat": "node", + "repeatDirection": "h", + "title": "Node $node Disks Vitals", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 30, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the amount of time the controller is busy with I/O commands. The controller is busy when there is a command outstanding to an I/O Queue (specifically, a command was issued via an I/O Submission Queue Tail doorbell write and the corresponding completion queue entry has not been posted yet to the associated I/O Completion Queue).\nThis value is reported in minutes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#185265", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 46, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_controller_busy_time_seconds{instance=~\"[[node]]\", device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Controller Busy Time minutes", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains a rate per second, based on the metric \"controller busy time minutes\" and the selected rate interval.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 21, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 64, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(nvme_controller_busy_time_seconds{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Controller Busy Time per seconds", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the number of 512 byte data units the host has read from the controller as part of processing a SMART Data Units Read Command; this value does not include metadata. This value is reported in thousands (i.e., a value of 1 corresponds to 1,000 \nunits of 512 bytes read) and is rounded up (e.g., one indicates that the number of 512 byte data units read is from 1 to 1,000, three indicates that the number of 512 byte data units read is from 2,001 to 3,000).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#7473af", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 60, + "options": { + "colorMode": "background_solid", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_data_units_read_total{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Data Units Read", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the number of SMART Host Read Commands completed by the controller.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#18648c", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 62, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_host_read_commands_total{instance=~\"[[node]]\",device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Host Read Commands per", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the per second rate over the selected rate interval for the metric \"Data Units Read\" converted to byte.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(nvme_data_units_read_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval]) * 512000", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Host Read Bytes per Second", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the per second rate, based on the selected rate interval, for the \"Host Read Commands\" metric", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 41, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(nvme_host_read_commands_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Host Read Commands per Second", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the per second rate over the selected rate interval of the calculated IO size obtained dividing the Read Units for the Host Read Commands rates ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "binbps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 65, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(rate(nvme_data_units_read_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval]) / rate(nvme_host_read_commands_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])) * 512000", + "hide": false, + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Host Read IO size per Second", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the number of 512 byte data units the host has written to the controller as part of processing a User Data Out Command; this value does not include metadata. This value is reported in thousands (i.e., a value of 1 corresponds to 1,000 units of 512 bytes written) and is rounded up (e.g., one indicates that the number of 512 byte data units written is from 1 to 1,000, three indicates that the number of 512 byte data units written is from 2,001 to 3,000).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d36e36" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 61, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_data_units_written_total{device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Data Units Written for $node", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "NVME SMART metric data_units_read_total", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#18648c" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 63, + "options": { + "colorMode": "background_solid", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "nvme_host_write_commands_total{device=~\"[[device]]\"}", + "instant": true, + "legendFormat": "{{device}}", + "range": false, + "refId": "A" + } + ], + "title": "Host Write Commands per Second for $node", + "transparent": true, + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Displays the amount of data written on the disks in a timeframe (offset) specified by the rate variable value of $rate ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(nvme_data_units_written_total{instance=~\"[[node]]\",device=~\"[[device]]\"}offset $__rate_interval) * 512000", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Host Write Bytes Total for $node", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the per second rate over the selected rate interval for the metric \"Data Units Write\" converted to byte.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 52 + }, + "id": 67, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(nvme_data_units_written_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval]) * 512000", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Host Write Bytes per Second for $node", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the per second rate over the selected rate interval of the calculated IO size obtained dividing the Write Units for the Host Write Commands rates ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 61 + }, + "id": 66, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(rate(nvme_data_units_written_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval]) / rate(nvme_host_write_commands_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])) * 512000", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Host Write IO size per Second for $node", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Contains the rate per second, based on the selected rate interval, for the metric \"Host Write Commands\"", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 61 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(nvme_host_write_commands_total{instance=~\"[[node]]\",device=~\"[[device]]\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{device}}", + "range": true, + "refId": "A" + } + ], + "title": "Host Write Commands per Second for $node", + "transparent": true, + "type": "timeseries" + } + ], + "repeat": "node", + "repeatDirection": "h", + "title": "Node $node Disks Stats", + "type": "row" + } + ], + "refresh": "10s", + "revision": 1, + "schemaVersion": 39, + "tags": [ + "NVME", + "Storage", + "E4" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "dbbfb289-0b79-48fb-b3a1-155ba7eee081" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(node_uname_info,cluster_name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "query": "label_values(node_uname_info,cluster_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvme_nvmecli{cluster_name=\"$cluster_name\"},instance)", + "hide": 0, + "includeAll": true, + "label": "node", + "multi": true, + "name": "node", + "options": [], + "query": { + "query": "label_values(nvme_nvmecli{cluster_name=\"$cluster_name\"},instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(nvme_power_on_hours_total{instance=~\"$node\"},device)", + "hide": 0, + "includeAll": true, + "label": "device", + "multi": true, + "name": "device", + "options": [], + "query": { + "query": "label_values(nvme_power_on_hours_total{instance=~\"$node\"},device)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m" + ] + }, + "timezone": "", + "title": "NVME", + "uid": "WwP1Px4Mz", + "version": 22, + "weekStart": "" +} diff --git a/nvme/nvme_metrics_parallel.sh b/nvme/nvme_metrics_parallel.sh new file mode 100644 index 0000000..6c89b18 --- /dev/null +++ b/nvme/nvme_metrics_parallel.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash + +# Script to capture NVME drives smart metrics using GNU parallel +# +# Dependencies: +# - nvme-cli 1.16 +# - jq 1.6-6 +# - parallel 20190922-1 +# +# Based on code from +# - https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/blob/master/nvme/nvme_metrics.sh +# +# Example Grafana dashboard: +# - https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/blob/master/nvme/nvme_metrics_dashboard.json +# +# Author: Davide Obbi + +set -eu + +# Ensure predictable numeric / date formats, etc. +export LC_ALL=C + +# Check if we are root +if [[ "${EUID}" -ne 0 ]]; then + echo "${0##*/}: Please run as root!" >&2 + exit 1 +fi + +# Check if programs are installed +if ! command -v nvme >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1 || ! command -v parallel >/dev/null 2>&1 ; then + echo "${0##*/}: nvme-cli nor jq nor parallel are not installed. Aborting." >&2 + exit 1 +fi + +output_format_awk="$( + cat <<'OUTPUTAWK' +BEGIN { v = "" } +v != $1 { + print "# HELP nvme_" $1 " SMART metric " $1; + if ($1 ~ /_total$/) + print "# TYPE nvme_" $1 " counter"; + else + print "# TYPE nvme_" $1 " gauge"; + v = $1 +} +{print "nvme_" $0} +OUTPUTAWK +)" + +format_output() { + sort | awk -F'{' "${output_format_awk}" +} + +# Get the nvme-cli version +nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" +echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output + +get_data() { + json_check="$(nvme smart-log -o json "${1}")" + disk="${1##*/}" + + # The temperature value in JSON is in Kelvin, we want Celsius + value_temperature="$(echo "${json_check}" | jq '.temperature - 273')" + echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}" + + value_available_spare="$(echo "${json_check}" | jq '.avail_spare / 100')" + echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}" + + value_available_spare_threshold="$(echo "${json_check}" | jq '.spare_thresh / 100')" + echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}" + + value_percentage_used="$(echo "${json_check}" | jq '.percent_used / 100')" + echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}" + + value_critical_warning="$(echo "${json_check}" | jq '.critical_warning')" + echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}" + + value_media_errors="$(echo "${json_check}" | jq -r '.media_errors')" + echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}" + + value_num_err_log_entries="$(echo "${json_check}" | jq -r '.num_err_log_entries')" + echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}" + + value_power_cycles="$(echo "${json_check}" | jq -r '.power_cycles')" + echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}" + + value_power_on_hours="$(echo "${json_check}" | jq -r '.power_on_hours')" + echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}" + + value_controller_busy_time="$(echo "${json_check}" | jq -r '.controller_busy_time')" + echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}" + + value_data_units_written="$(echo "${json_check}" | jq -r '.data_units_written')" + echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}" + + value_data_units_read="$(echo "${json_check}" | jq -r '.data_units_read')" + echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}" + + value_host_read_commands="$(echo "${json_check}" | jq -r '.host_read_commands')" + echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}" + + value_host_write_commands="$(echo "${json_check}" | jq -r '.host_write_commands')" + echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}" + + value_unsafe_shutdowns="$(echo "${json_check}" | jq -r '.unsafe_shutdowns')" + echo "unsafe_shutdowns_total{device=\"${disk}\"} ${value_unsafe_shutdowns}" + + value_endurance_grp_critical_warning_summary="$(echo "${json_check}" | jq -r '.endurance_grp_critical_warning_summary')" + echo "endurance_grp_critical_warning_summary{device=\"${disk}\"} ${value_endurance_grp_critical_warning_summary}" + +} + +export -f get_data +export HOME="/tmp" + +nvme list -o json | jq -r '.Devices | .[].DevicePath' | parallel get_data | format_output