From 395c7a1e3d447d7ab5341956afb5e68f3139cf3c Mon Sep 17 00:00:00 2001
From: Vitaly Zhuravlev
Date: Mon, 3 Jun 2024 12:22:46 +0000
Subject: [PATCH 1/2] Add NodeSystemdServiceCrashlooping alert

Signed-off-by: Vitaly Zhuravlev
---
Review notes (ignored by git am):
  The new rule matches when node_systemd_service_restart_total grew by more
  than 2 within a 5m window, and fires once that has held for 15m.
  Line structure and indentation of this series were reconstructed from a
  whitespace-collapsed copy; apply with --ignore-whitespace if context
  does not match, or regenerate with git format-patch.

 docs/node-observ-lib/linux/alerts.libsonnet | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/node-observ-lib/linux/alerts.libsonnet b/docs/node-observ-lib/linux/alerts.libsonnet
index 00d33d3d10..00c4457b72 100644
--- a/docs/node-observ-lib/linux/alerts.libsonnet
+++ b/docs/node-observ-lib/linux/alerts.libsonnet
@@ -414,6 +414,20 @@
               description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}',
             },
           },
+          {
+            alert: 'NodeSystemdServiceCrashlooping',
+            expr: |||
+              increase(node_systemd_service_restart_total{%(filteringSelector)s}[5m]) > 2
+            ||| % this.config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Systemd service keeps restaring, possibly crash looping.',
+              description: 'Systemd service {{ $labels.name }} has being restarted too many times at {{ $labels.instance }} for the last 15 minutes. Please check if service is crash looping.',
+            },
+          },
         ]
         + if this.config.enableHardware then
           [{

From 9dec977e304e9755ef9c2a8421b0fb6a1e97742c Mon Sep 17 00:00:00 2001
From: Vitaly Zhuravlev
Date: Mon, 3 Jun 2024 12:47:53 +0000
Subject: [PATCH 2/2] Fix typo

Signed-off-by: Vitaly Zhuravlev
---
Review notes (ignored by git am):
  Besides "has being" -> "has been" in the description, this patch now also
  fixes "restaring" -> "restarting" in the summary annotation, which the
  original revision left untouched.
  The post-image blob id on the index line predates the summary fix and is
  stale; regenerate with git format-patch after applying.

 docs/node-observ-lib/linux/alerts.libsonnet | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/node-observ-lib/linux/alerts.libsonnet b/docs/node-observ-lib/linux/alerts.libsonnet
index 00c4457b72..8cc89d8fdf 100644
--- a/docs/node-observ-lib/linux/alerts.libsonnet
+++ b/docs/node-observ-lib/linux/alerts.libsonnet
@@ -424,8 +424,8 @@
               severity: 'warning',
             },
             annotations: {
-              summary: 'Systemd service keeps restaring, possibly crash looping.',
-              description: 'Systemd service {{ $labels.name }} has being restarted too many times at {{ $labels.instance }} for the last 15 minutes. Please check if service is crash looping.',
+              summary: 'Systemd service keeps restarting, possibly crash looping.',
+              description: 'Systemd service {{ $labels.name }} has been restarted too many times at {{ $labels.instance }} for the last 15 minutes. Please check if service is crash looping.',
             },
           },
         ]