From 8a64c5c172812f5288d04f2a528678b77cef3e2d Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Thu, 16 May 2024 12:04:47 +0100 Subject: [PATCH 1/2] Add alerts for low available swap space --- .../kolla/config/prometheus/system.rules | 18 ++++++++++++++++++ etc/kayobe/stackhpc-monitoring.yml | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index 613368be6..7981a5609 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -24,6 +24,24 @@ groups: summary: "Prometheus exporter at {{ $labels.instance }} reports low memory" description: "Available memory is {{ $value }} GiB." + - alert: LowSwapSpace + expr: (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes) < {% endraw %}{{ alertmanager_node_free_swap_warning_threshold_ratio }}{% raw %} + for: 1m + labels: + severity: warning + annotations: + summary: "Prometheus exporter at {{ $labels.instance }} reports low swap space" + description: "Available swap space is {{ $value | humanizePercentage }}. Running out of swap space causes OOM Kills." + + - alert: LowSwapSpace + expr: (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes) < {% endraw %}{{ alertmanager_node_free_swap_critical_threshold_ratio }}{% raw %} + for: 1m + labels: + severity: critical + annotations: + summary: "Prometheus exporter at {{ $labels.instance }} reports low swap space" + description: "Available swap space is {{ $value | humanizePercentage }}. Running out of swap space causes OOM Kills." + - alert: HostOomKillDetected expr: increase(node_vmstat_oom_kill[5m]) > 0 for: 5m diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index e8e0bb91f..185a87ebf 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -12,6 +12,12 @@ alertmanager_low_memory_threshold_gib: 5 # link. Change to false to disable this alert. 
alertmanager_warn_network_bond_single_link: true +# Threshold to trigger a LowSwapSpace alert on swap space depletion (ratio). +# When the ratio of free swap space is lower than each of these values, warning +# and critical alerts will be triggered respectively. +alertmanager_node_free_swap_warning_threshold_ratio: 0.25 +alertmanager_node_free_swap_critical_threshold_ratio: 0.1 + ############################################################################### # Exporter configuration From 643aa78a83c969720e34e21fbac79752359937e8 Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Tue, 21 May 2024 10:51:36 +0100 Subject: [PATCH 2/2] Add releasenote for swap space monitoring --- ...erts-for-swap-availability-75e28ed7f913d1ec.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 releasenotes/notes/add-alerts-for-swap-availability-75e28ed7f913d1ec.yaml diff --git a/releasenotes/notes/add-alerts-for-swap-availability-75e28ed7f913d1ec.yaml b/releasenotes/notes/add-alerts-for-swap-availability-75e28ed7f913d1ec.yaml new file mode 100644 index 000000000..db5efb85c --- /dev/null +++ b/releasenotes/notes/add-alerts-for-swap-availability-75e28ed7f913d1ec.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + Added two alerts (warning and critical) that are triggered when the ratio + of (free_swap_space / total_swap_space) is below thresholds. + Each threshold can be modified by altering the value of + ``alertmanager_node_free_swap_warning_threshold_ratio`` and + ``alertmanager_node_free_swap_critical_threshold_ratio``. + + Currently this solution is limited to a one-size-fits-all policy. + This can cause unwanted alerts for the hosts which utilise swap heavily. + Therefore it is recommended to tune the thresholds or apply silence rules + as needed.