From e8deda2719db3254ba590c7a29dd4ca8db8e4343 Mon Sep 17 00:00:00 2001 From: v-zhuravlev Date: Mon, 3 Jun 2024 21:49:10 +0800 Subject: [PATCH] Add systemd service crashlooping alert (#30) * Add NodeSystemdServiceCrashlooping alert Signed-off-by: Vitaly Zhuravlev * Fix typo Signed-off-by: Vitaly Zhuravlev --------- Signed-off-by: Vitaly Zhuravlev --- docs/node-observ-lib/linux/alerts.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/node-observ-lib/linux/alerts.libsonnet b/docs/node-observ-lib/linux/alerts.libsonnet index 00d33d3d10..8cc89d8fdf 100644 --- a/docs/node-observ-lib/linux/alerts.libsonnet +++ b/docs/node-observ-lib/linux/alerts.libsonnet @@ -414,6 +414,20 @@ description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', }, }, + { + alert: 'NodeSystemdServiceCrashlooping', + expr: ||| + increase(node_systemd_service_restart_total{%(filteringSelector)s}[5m]) > 2 + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Systemd service keeps restarting, possibly crash looping.', + description: 'Systemd service {{ $labels.name }} has been restarted too many times at {{ $labels.instance }} for the last 15 minutes. Please check if service is crash looping.', + }, + }, ] + if this.config.enableHardware then [{