From 5a23e0a01136d0a6ffab6da2d39b1d8d7239c4e3 Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Fri, 1 Dec 2023 13:20:08 +0000 Subject: [PATCH 1/2] Add conditional hardware panel: temp sensor. --- docs/node-observ-lib/linux/alerts.libsonnet | 512 +++++++++--------- docs/node-observ-lib/linux/config.libsonnet | 5 + .../linux/dashboards.libsonnet | 10 +- docs/node-observ-lib/linux/panels.libsonnet | 6 + docs/node-observ-lib/linux/targets.libsonnet | 8 + 5 files changed, 292 insertions(+), 249 deletions(-) diff --git a/docs/node-observ-lib/linux/alerts.libsonnet b/docs/node-observ-lib/linux/alerts.libsonnet index e7db3fba77..366144da40 100644 --- a/docs/node-observ-lib/linux/alerts.libsonnet +++ b/docs/node-observ-lib/linux/alerts.libsonnet @@ -161,259 +161,275 @@ { // defaults to 'node-exporter for backward compatibility with old node-mixin name: if this.config.uid == 'node' then 'node-exporter' else this.config.uid + '-alerts', - rules: [ - { - alert: 'NodeNetworkReceiveErrs', - expr: ||| - rate(node_network_receive_errs_total{%(filteringSelector)s}[2m]) / rate(node_network_receive_packets_total{%(filteringSelector)s}[2m]) > 0.01 - ||| % this.config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Network interface is reporting many receive errors.', - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.', - }, - }, - { - alert: 'NodeNetworkTransmitErrs', - expr: ||| - rate(node_network_transmit_errs_total{%(filteringSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(filteringSelector)s}[2m]) > 0.01 - ||| % this.config, - 'for': '1h', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Network interface is reporting many transmit errors.', - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.', - }, - }, 
- { - alert: 'NodeHighNumberConntrackEntriesUsed', - expr: ||| - (node_nf_conntrack_entries{%(filteringSelector)s} / node_nf_conntrack_entries_limit) > 0.75 - ||| % this.config, - annotations: { - summary: 'Number of conntrack are getting close to the limit.', - description: '{{ $value | humanizePercentage }} of conntrack entries are used.', - }, - labels: { - severity: 'warning', - }, - }, - { - alert: 'NodeTextFileCollectorScrapeError', - expr: ||| - node_textfile_scrape_error{%(filteringSelector)s} == 1 - ||| % this.config, - annotations: { - summary: 'Node Exporter text file collector failed to scrape.', - description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', - }, - labels: { - severity: 'warning', - }, - }, - { - alert: 'NodeClockSkewDetected', - expr: ||| - ( - node_timex_offset_seconds{%(filteringSelector)s} > 0.05 - and - deriv(node_timex_offset_seconds{%(filteringSelector)s}[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds{%(filteringSelector)s} < -0.05 - and - deriv(node_timex_offset_seconds{%(filteringSelector)s}[5m]) <= 0 - ) - ||| % this.config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Clock skew detected.', - description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', - }, - }, - { - alert: 'NodeClockNotSynchronising', - expr: ||| - min_over_time(node_timex_sync_status{%(filteringSelector)s}[5m]) == 0 - and - node_timex_maxerror_seconds{%(filteringSelector)s} >= 16 - ||| % this.config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Clock not synchronising.', - description: 'Clock at {{ $labels.instance }} is not synchronising. 
Ensure NTP is configured on this host.', - }, - }, - { - alert: 'NodeRAIDDegraded', - expr: ||| - node_md_disks_required{%(filteringSelector)s,%(diskDeviceSelector)s} - ignoring (state) (node_md_disks{state="active",%(filteringSelector)s,%(diskDeviceSelector)s}) > 0 - ||| % this.config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'RAID Array is degraded.', - description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", - }, - }, - { - alert: 'NodeRAIDDiskFailure', - expr: ||| - node_md_disks{state="failed",%(filteringSelector)s,%(diskDeviceSelector)s} > 0 - ||| % this.config, - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Failed device in RAID array.', - description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", - }, - }, - { - alert: 'NodeFileDescriptorLimit', - expr: ||| - ( - node_filefd_allocated{%(filteringSelector)s} * 100 / node_filefd_maximum{%(filteringSelector)s} > 70 - ) - ||| % this.config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Kernel is predicted to exhaust file descriptors limit soon.', - description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', - }, - }, - { - alert: 'NodeFileDescriptorLimit', - expr: ||| - ( - node_filefd_allocated{%(filteringSelector)s} * 100 / node_filefd_maximum{%(filteringSelector)s} > 90 - ) - ||| % this.config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Kernel is predicted to exhaust file descriptors limit soon.', - description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', - }, - }, - { - alert: 'NodeCPUHighUsage', - expr: ||| - sum without(mode) 
(avg without (cpu) (rate(node_cpu_seconds_total{%(filteringSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d - ||| % this.config, - 'for': '15m', - labels: { - severity: 'info', - }, - annotations: { - summary: 'High CPU usage.', - description: ||| - CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + rules: + [ + { + alert: 'NodeNetworkReceiveErrs', + expr: ||| + rate(node_network_receive_errs_total{%(filteringSelector)s}[2m]) / rate(node_network_receive_packets_total{%(filteringSelector)s}[2m]) > 0.01 ||| % this.config, - }, - }, - { - alert: 'NodeSystemSaturation', - expr: ||| - node_load1{%(filteringSelector)s} - / count without (cpu, mode) (node_cpu_seconds_total{%(filteringSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d - ||| % this.config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'System saturated, load per core is very high.', - description: ||| - System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - This might indicate this instance resources saturation and can cause it becoming unresponsive. 
+ 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Network interface is reporting many receive errors.', + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.', + }, + }, + { + alert: 'NodeNetworkTransmitErrs', + expr: ||| + rate(node_network_transmit_errs_total{%(filteringSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(filteringSelector)s}[2m]) > 0.01 ||| % this.config, - }, - }, - { - alert: 'NodeMemoryMajorPagesFaults', - expr: ||| - rate(node_vmstat_pgmajfault{%(filteringSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d - ||| % this.config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Memory major page faults are occurring at very high rate.', - description: ||| - Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - Please check that there is enough memory available at this instance. 
+ 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Network interface is reporting many transmit errors.', + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.', + }, + }, + { + alert: 'NodeHighNumberConntrackEntriesUsed', + expr: ||| + (node_nf_conntrack_entries{%(filteringSelector)s} / node_nf_conntrack_entries_limit) > 0.75 ||| % this.config, - }, - }, - { - alert: 'NodeMemoryHighUtilization', - expr: ||| - 100 - (node_memory_MemAvailable_bytes{%(filteringSelector)s} / node_memory_MemTotal_bytes{%(filteringSelector)s} * 100) > %(memoryHighUtilizationThreshold)d - ||| % this.config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Host is running out of memory.', - description: ||| - Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + annotations: { + summary: 'Number of conntrack are getting close to the limit.', + description: '{{ $value | humanizePercentage }} of conntrack entries are used.', + }, + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeTextFileCollectorScrapeError', + expr: ||| + node_textfile_scrape_error{%(filteringSelector)s} == 1 ||| % this.config, - }, - }, - { - alert: 'NodeDiskIOSaturation', - expr: ||| - rate(node_disk_io_time_weighted_seconds_total{%(filteringSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d - ||| % this.config, - 'for': '30m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Disk IO queue is high.', - description: ||| - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. - This symptom might indicate disk saturation. 
+ annotations: { + summary: 'Node Exporter text file collector failed to scrape.', + description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', + }, + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeClockSkewDetected', + expr: ||| + ( + node_timex_offset_seconds{%(filteringSelector)s} > 0.05 + and + deriv(node_timex_offset_seconds{%(filteringSelector)s}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{%(filteringSelector)s} < -0.05 + and + deriv(node_timex_offset_seconds{%(filteringSelector)s}[5m]) <= 0 + ) ||| % this.config, - }, - }, - { - alert: 'NodeSystemdServiceFailed', - expr: ||| - node_systemd_unit_state{%(filteringSelector)s, state="failed"} == 1 - ||| % this.config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'Systemd service has entered failed state.', - description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', - }, - }, - ], + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Clock skew detected.', + description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', + }, + }, + { + alert: 'NodeClockNotSynchronising', + expr: ||| + min_over_time(node_timex_sync_status{%(filteringSelector)s}[5m]) == 0 + and + node_timex_maxerror_seconds{%(filteringSelector)s} >= 16 + ||| % this.config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Clock not synchronising.', + description: 'Clock at {{ $labels.instance }} is not synchronising. 
Ensure NTP is configured on this host.', + }, + }, + { + alert: 'NodeRAIDDegraded', + expr: ||| + node_md_disks_required{%(filteringSelector)s,%(diskDeviceSelector)s} - ignoring (state) (node_md_disks{state="active",%(filteringSelector)s,%(diskDeviceSelector)s}) > 0 + ||| % this.config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'RAID Array is degraded.', + description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.", + }, + }, + { + alert: 'NodeRAIDDiskFailure', + expr: ||| + node_md_disks{state="failed",%(filteringSelector)s,%(diskDeviceSelector)s} > 0 + ||| % this.config, + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Failed device in RAID array.', + description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", + }, + }, + { + alert: 'NodeFileDescriptorLimit', + expr: ||| + ( + node_filefd_allocated{%(filteringSelector)s} * 100 / node_filefd_maximum{%(filteringSelector)s} > 70 + ) + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Kernel is predicted to exhaust file descriptors limit soon.', + description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', + }, + }, + { + alert: 'NodeFileDescriptorLimit', + expr: ||| + ( + node_filefd_allocated{%(filteringSelector)s} * 100 / node_filefd_maximum{%(filteringSelector)s} > 90 + ) + ||| % this.config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Kernel is predicted to exhaust file descriptors limit soon.', + description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', + }, + }, + { + alert: 'NodeCPUHighUsage', + expr: ||| + sum without(mode) 
(avg without (cpu) (rate(node_cpu_seconds_total{%(filteringSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'info', + }, + annotations: { + summary: 'High CPU usage.', + description: ||| + CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + ||| % this.config, + }, + }, + { + alert: 'NodeSystemSaturation', + expr: ||| + node_load1{%(filteringSelector)s} + / count without (cpu, mode) (node_cpu_seconds_total{%(filteringSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'System saturated, load per core is very high.', + description: ||| + System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + This might indicate this instance resources saturation and can cause it becoming unresponsive. + ||| % this.config, + }, + }, + { + alert: 'NodeMemoryMajorPagesFaults', + expr: ||| + rate(node_vmstat_pgmajfault{%(filteringSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Memory major page faults are occurring at very high rate.', + description: ||| + Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + Please check that there is enough memory available at this instance. 
+ ||| % this.config, + }, + }, + { + alert: 'NodeMemoryHighUtilization', + expr: ||| + 100 - (node_memory_MemAvailable_bytes{%(filteringSelector)s} / node_memory_MemTotal_bytes{%(filteringSelector)s} * 100) > %(memoryHighUtilizationThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Host is running out of memory.', + description: ||| + Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. + ||| % this.config, + }, + }, + { + alert: 'NodeDiskIOSaturation', + expr: ||| + rate(node_disk_io_time_weighted_seconds_total{%(filteringSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d + ||| % this.config, + 'for': '30m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Disk IO queue is high.', + description: ||| + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + This symptom might indicate disk saturation. 
+ ||| % this.config, + }, + }, + { + alert: 'NodeSystemdServiceFailed', + expr: ||| + node_systemd_unit_state{%(filteringSelector)s, state="failed"} == 1 + ||| % this.config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Systemd service has entered failed state.', + description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', + }, + }, + ] + + if this.config.enableHardware then + [{ + alert: 'NodeHardwareTemperatureHigh', + expr: ||| + avg_over_time(node_hwmon_temp_celsius{%(filteringSelector)s}[5m]) > %(temperatureWarnTreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: "Sensor's temperature is high.", + description: 'Sensor {{ $labels.sensor }} is reporting high temperature on chip {{ $labels.chip }}, and has been above %(temperatureWarnTreshold)d°C for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}°C.' % this.config, + }, + }] else [], }, ], }, diff --git a/docs/node-observ-lib/linux/config.libsonnet b/docs/node-observ-lib/linux/config.libsonnet index eed54bbab0..94acca6184 100644 --- a/docs/node-observ-lib/linux/config.libsonnet +++ b/docs/node-observ-lib/linux/config.libsonnet @@ -84,6 +84,11 @@ // 'NodeDiskIOSaturation' alert. 
diskIOSaturationThreshold: 10, + // Enable hardware related panels and alerts (temp sensors) + enableHardware: false, + // Temperature sensor warning threshold, in degrees Celsius + temperatureWarnTreshold: 80, + rateInterval: '5m', dashboardPeriod: 'now-1h', diff --git a/docs/node-observ-lib/linux/dashboards.libsonnet b/docs/node-observ-lib/linux/dashboards.libsonnet index f09bef6415..a76bf73908 100644 --- a/docs/node-observ-lib/linux/dashboards.libsonnet +++ b/docs/node-observ-lib/linux/dashboards.libsonnet @@ -60,7 +60,15 @@ local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libso g.panel.row.new('Network'), panels.networkUsagePerSec { gridPos+: { w: 12, h: 8 } }, panels.networkErrorsAndDroppedPerSec { gridPos+: { w: 12, h: 8 } }, - ], 6, 2 + + ] + + + if this.config.enableHardware then + [ + g.panel.row.new('Hardware'), + panels.hardwareTemperature { gridPos+: { w: 12, h: 8 } }, + ] else [] + , 6, 2 ) ) // defaults to uid=nodes for backward compatibility with old node-mixins diff --git a/docs/node-observ-lib/linux/panels.libsonnet b/docs/node-observ-lib/linux/panels.libsonnet index d924ac2eb9..4d0331cccf 100644 --- a/docs/node-observ-lib/linux/panels.libsonnet +++ b/docs/node-observ-lib/linux/panels.libsonnet @@ -1138,5 +1138,11 @@ local utils = commonlib.utils; description='Rate of ICMP messages received and transmitted with errors.' 
) + panel.standardOptions.withUnit('err/s'), + + hardwareTemperature: + commonlib.panels.hardware.timeSeries.temperature.new( + 'Temperature', + targets=[t.hardwareTemperature] + ), }, } diff --git a/docs/node-observ-lib/linux/targets.libsonnet b/docs/node-observ-lib/linux/targets.libsonnet index f3b6dcb1ff..54a5354810 100644 --- a/docs/node-observ-lib/linux/targets.libsonnet +++ b/docs/node-observ-lib/linux/targets.libsonnet @@ -1135,5 +1135,13 @@ local lokiQuery = g.query.loki; 'irate(node_netstat_Icmp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables ) + prometheusQuery.withLegendFormat('ICMP6 errors'), + + hardwareTemperature: + prometheusQuery.new( + prometheusDatasource, + 'node_hwmon_temp_celsius{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('{{sensor}}'), + }, } From 6ea5228b71c360341101c11c9f005c652f55464d Mon Sep 17 00:00:00 2001 From: Vitaly Zhuravlev Date: Fri, 1 Dec 2023 13:24:15 +0000 Subject: [PATCH 2/2] Update legend --- docs/node-observ-lib/linux/targets.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/node-observ-lib/linux/targets.libsonnet b/docs/node-observ-lib/linux/targets.libsonnet index 54a5354810..cc9dd8e94c 100644 --- a/docs/node-observ-lib/linux/targets.libsonnet +++ b/docs/node-observ-lib/linux/targets.libsonnet @@ -1141,7 +1141,7 @@ local lokiQuery = g.query.loki; prometheusDatasource, 'node_hwmon_temp_celsius{%(queriesSelector)s}' % variables ) - + prometheusQuery.withLegendFormat('{{sensor}}'), + + prometheusQuery.withLegendFormat('{{chip}}/{{sensor}}'), }, }