From 10f2583794cc968e82aa8fe0b04ad9dc0f6e7e2c Mon Sep 17 00:00:00 2001 From: Deezzir Date: Fri, 13 Dec 2024 22:45:35 -0500 Subject: [PATCH 1/2] Consider physical blocksize for SmartHealthStatusFail alert --- src/prometheus_alert_rules/smart.yaml | 2 +- tests/unit/test_alert_rules/test_smart.yaml | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/prometheus_alert_rules/smart.yaml b/src/prometheus_alert_rules/smart.yaml index d4a31df..563bf77 100644 --- a/src/prometheus_alert_rules/smart.yaml +++ b/src/prometheus_alert_rules/smart.yaml @@ -33,7 +33,7 @@ groups: LABELS = {{ $labels }} - alert: SmartHealthStatusFail - expr: smartctl_device_smart_status == 0 + expr: (smartctl_device_smart_status == 0) and on(device, juju_unit) (smartctl_device_block_size{blocks_type="physical"} != 0) for: 2m labels: severity: critical diff --git a/tests/unit/test_alert_rules/test_smart.yaml b/tests/unit/test_alert_rules/test_smart.yaml index 5030e22..051f35e 100644 --- a/tests/unit/test_alert_rules/test_smart.yaml +++ b/tests/unit/test_alert_rules/test_smart.yaml @@ -68,6 +68,8 @@ tests: input_series: - series: 'smartctl_device_smart_status{device="sda", instance="ubuntu-1"}' values: '0x15' + - series: 'smartctl_device_block_size{device="sda", instance="ubuntu-1", blocks_type="physical"}' + values: '1x15' alert_rule_test: - eval_time: 10m @@ -84,6 +86,18 @@ tests: VALUE = 0 LABELS = map[__name__:smartctl_device_smart_status device:sda instance:ubuntu-1] + - interval: 1m + input_series: + - series: 'smartctl_device_smart_status{device="sda", instance="ubuntu-1"}' + values: '0x15' + - series: 'smartctl_device_block_size{device="sda", instance="ubuntu-1", blocks_type="physical"}' + values: '0x15' + + alert_rule_test: + - eval_time: 10m + alertname: SmartHealthStatusFail + exp_alerts: # alerts shouldn't fire since block size is 0 + - interval: 1m input_series: - series: 'smartctl_device_smartctl_exit_status{device="sda", instance="ubuntu-2"}' From 
f29248644856e40c04e16574d0cd955c73cbfa17 Mon Sep 17 00:00:00 2001 From: Deezzir Date: Mon, 16 Dec 2024 18:59:12 -0500 Subject: [PATCH 2/2] Add comments --- src/prometheus_alert_rules/smart.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/prometheus_alert_rules/smart.yaml b/src/prometheus_alert_rules/smart.yaml index 563bf77..d7edcdf 100644 --- a/src/prometheus_alert_rules/smart.yaml +++ b/src/prometheus_alert_rules/smart.yaml @@ -33,6 +33,9 @@ groups: LABELS = {{ $labels }} - alert: SmartHealthStatusFail + # A physical block size of 0 means the device is logical (e.g. a HW RAID volume). + # Logical devices always report SMART status 0, so only alert when the physical + # block size is non-zero; this avoids false positives. expr: (smartctl_device_smart_status == 0) and on(device, juju_unit) (smartctl_device_block_size{blocks_type="physical"} != 0) for: 2m labels: