From 62fd448f25c0bb457c3c82e246bfd10e02470d64 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Tue, 1 Oct 2024 15:37:22 -0400
Subject: [PATCH] Add separate dcgm alert for each throttle reason

Replace the single GpuClockThrottlingDetected alert with one alert per
throttle-reason bit of DCGM_FI_DEV_CLOCK_THROTTLE_REASONS.

PromQL has no bitwise operators, so bit 2^k of the bitmask is tested
with modulo arithmetic: (value % 2^(k+1)) >= 2^k.

The unit tests spell out the complete expected annotations because
promtool compares expected and actual annotations exactly.
---
 src/prometheus_alert_rules/DCGM.yaml       |  15 -
 src/prometheus_alert_rules/dcgm.yaml       |  67 ++++
 tests/unit/test_alert_rules/test_DCGM.yaml |  24 --
 tests/unit/test_alert_rules/test_dcgm.yaml | 326 +++++++++++++++++++++
 4 files changed, 393 insertions(+), 39 deletions(-)
 delete mode 100644 src/prometheus_alert_rules/DCGM.yaml
 create mode 100644 src/prometheus_alert_rules/dcgm.yaml
 delete mode 100644 tests/unit/test_alert_rules/test_DCGM.yaml
 create mode 100644 tests/unit/test_alert_rules/test_dcgm.yaml

diff --git a/src/prometheus_alert_rules/DCGM.yaml b/src/prometheus_alert_rules/DCGM.yaml
deleted file mode 100644
index f74c01dc..00000000
--- a/src/prometheus_alert_rules/DCGM.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-groups:
-- name: DCGM
-  rules:
-  - alert: GpuClockThrottlingDetected
-    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS != 0
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: GPU clock throttling detected. (instance {{ $labels.instance }})
-      description: |
-        GPU clock throttling has been detected on GPU {{ $labels.gpu }}.
-        Throttle reasons (bitmask): {{ $value }}
-        Refer to the DCGM documentation for more information: https://docs.nvidia.com/datacenter/dcgm/2.1/dcgm-api/group__dcgmFieldConstants.html
-        LABELS = {{ $labels }}
diff --git a/src/prometheus_alert_rules/dcgm.yaml b/src/prometheus_alert_rules/dcgm.yaml
new file mode 100644
index 00000000..0469c0c6
--- /dev/null
+++ b/src/prometheus_alert_rules/dcgm.yaml
@@ -0,0 +1,67 @@
+groups:
+- name: NVIDIA DCGM Throttling Alerts
+  rules:
+  # PromQL has no bitwise operators, so each throttle-reason bit 2^k is
+  # tested arithmetically: (value % 2^(k+1)) >= 2^k.
+  - alert: HWPowerBrakeThrottle
+    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128  # bit 128
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: GPU Hardware Power Brake Slowdown throttling detected. (instance {{ $labels.instance }})
+      description: |
+        HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
+        This is an indicator of:
+        - External Power Brake Assertion being triggered (e.g. by the system power supply)
+        LABELS = {{ $labels }}
+  - alert: HWThermalThrottle
+    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64  # bit 64
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.instance }})
+      description: |
+        HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
+        This is an indicator of:
+        - Temperature being too high
+        LABELS = {{ $labels }}
+  - alert: SWThermalThrottle
+    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32  # bit 32
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: GPU Software Thermal throttling detected. (instance {{ $labels.instance }})
+      description: |
+        SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
+        This is an indicator of:
+        - Current GPU temperature above the GPU Max Operating Temperature
+        - Current memory temperature above the Memory Max Operating Temperature
+        LABELS = {{ $labels }}
+  - alert: HWSlowdownThrottle
+    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8  # bit 8
+    for: 3m
+    labels:
+      severity: warning
+    annotations:
+      summary: GPU Hardware Slowdown throttling detected. (instance {{ $labels.instance }})
+      description: |
+        HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
+        This is an indicator of:
+        - Temperature being too high
+        - External Power Brake Assertion is triggered (e.g. by the system power supply)
+        - Power draw is too high and Fast Trigger protection is reducing the clocks
+        - May be also reported during PState or clock change
+        LABELS = {{ $labels }}
+  - alert: SWPowerThrottle
+    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 8 >= 4  # bit 4
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: GPU Software Power throttling detected. (instance {{ $labels.instance }})
+      description: |
+        SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
+        LABELS = {{ $labels }}
diff --git a/tests/unit/test_alert_rules/test_DCGM.yaml b/tests/unit/test_alert_rules/test_DCGM.yaml
deleted file mode 100644
index 0d2940ad..00000000
--- a/tests/unit/test_alert_rules/test_DCGM.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-rule_files:
-  - ../../../src/prometheus_alert_rules/DCGM.yaml
-
-tests:
-  - interval: 1m
-    input_series:
-      - series: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", gpu="0"}
-        values: '32x15' # Value indicating software thermal throttling (0x20)
-
-    alert_rule_test:
-      - eval_time: 10m
-        alertname: GpuClockThrottlingDetected
-        exp_alerts:
-          - exp_labels:
-              severity: warning
-              instance: ubuntu-0
-              gpu: "0"
-            exp_annotations:
-              summary: GPU clock throttling detected. (instance ubuntu-0)
-              description: |
-                GPU clock throttling has been detected on GPU 0.
-                Throttle reasons (bitmask): 32
-                Refer to the DCGM documentation for more information: https://docs.nvidia.com/datacenter/dcgm/2.1/dcgm-api/group__dcgmFieldConstants.html
-                LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:0 instance:ubuntu-0]
diff --git a/tests/unit/test_alert_rules/test_dcgm.yaml b/tests/unit/test_alert_rules/test_dcgm.yaml
new file mode 100644
index 00000000..ee3662e5
--- /dev/null
+++ b/tests/unit/test_alert_rules/test_dcgm.yaml
@@ -0,0 +1,326 @@
+rule_files:
+  - ../../../src/prometheus_alert_rules/dcgm.yaml
+
+evaluation_interval: 1m
+
+tests:
+# HW Power Brake Throttle active
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
+    values: '128 128 128 128 128'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-0
+        nvidia_gpu: "0"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0)
+        description: |
+          HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0.
+          This is an indicator of:
+          - External Power Brake Assertion being triggered (e.g. by the system power supply)
+          LABELS = map[instance:ubuntu-0 nvidia_gpu:0]
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts: []
+
+# HW Thermal Throttle active
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="1"}'
+    values: '64 64 64 64 64'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-0
+        nvidia_gpu: "1"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Thermal throttling detected. (instance ubuntu-0)
+        description: |
+          HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 1.
+          This is an indicator of:
+          - Temperature being too high
+          LABELS = map[instance:ubuntu-0 nvidia_gpu:1]
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts: []
+
+# SW Thermal Throttle active
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="0"}'
+    values: '32 32 32 32 32'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-1
+        nvidia_gpu: "0"
+        severity: warning
+      exp_annotations:
+        summary: GPU Software Thermal throttling detected. (instance ubuntu-1)
+        description: |
+          SW Thermal Slowdown is engaged on NVIDIA GPU: 0.
+          This is an indicator of:
+          - Current GPU temperature above the GPU Max Operating Temperature
+          - Current memory temperature above the Memory Max Operating Temperature
+          LABELS = map[instance:ubuntu-1 nvidia_gpu:0]
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts: []
+
+# HW Slowdown Throttle active
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="1"}'
+    values: '8 8 8 8 8'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-1
+        nvidia_gpu: "1"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-1)
+        description: |
+          HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 1.
+          This is an indicator of:
+          - Temperature being too high
+          - External Power Brake Assertion is triggered (e.g. by the system power supply)
+          - Power draw is too high and Fast Trigger protection is reducing the clocks
+          - May be also reported during PState or clock change
+          LABELS = map[instance:ubuntu-1 nvidia_gpu:1]
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts: []
+
+# SW Power Throttle active
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-2", nvidia_gpu="0"}'
+    values: '4 4 4 4 4'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-2
+        nvidia_gpu: "0"
+        severity: warning
+      exp_annotations:
+        summary: GPU Software Power throttling detected. (instance ubuntu-2)
+        description: |
+          SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 0.
+          LABELS = map[instance:ubuntu-2 nvidia_gpu:0]
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts: []
+
+# No throttling
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
+    values: '0 0 0 0 0'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts: []
+
+# All throttling reasons active
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-3", nvidia_gpu="2"}'
+    values: '255 255 255 255 255'
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-3
+        nvidia_gpu: "2"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-3)
+        description: |
+          HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2.
+          This is an indicator of:
+          - External Power Brake Assertion being triggered (e.g. by the system power supply)
+          LABELS = map[instance:ubuntu-3 nvidia_gpu:2]
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-3
+        nvidia_gpu: "2"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Thermal throttling detected. (instance ubuntu-3)
+        description: |
+          HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2.
+          This is an indicator of:
+          - Temperature being too high
+          LABELS = map[instance:ubuntu-3 nvidia_gpu:2]
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-3
+        nvidia_gpu: "2"
+        severity: warning
+      exp_annotations:
+        summary: GPU Software Thermal throttling detected. (instance ubuntu-3)
+        description: |
+          SW Thermal Slowdown is engaged on NVIDIA GPU: 2.
+          This is an indicator of:
+          - Current GPU temperature above the GPU Max Operating Temperature
+          - Current memory temperature above the Memory Max Operating Temperature
+          LABELS = map[instance:ubuntu-3 nvidia_gpu:2]
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-3
+        nvidia_gpu: "2"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-3)
+        description: |
+          HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2.
+          This is an indicator of:
+          - Temperature being too high
+          - External Power Brake Assertion is triggered (e.g. by the system power supply)
+          - Power draw is too high and Fast Trigger protection is reducing the clocks
+          - May be also reported during PState or clock change
+          LABELS = map[instance:ubuntu-3 nvidia_gpu:2]
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-3
+        nvidia_gpu: "2"
+        severity: warning
+      exp_annotations:
+        summary: GPU Software Power throttling detected. (instance ubuntu-3)
+        description: |
+          SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 2.
+          LABELS = map[instance:ubuntu-3 nvidia_gpu:2]
+
+# Multiple throttling reasons
+- interval: 1m
+  input_series:
+  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
+    values: '196 196 196 196 196'  # 128 + 64 + 4
+  alert_rule_test:
+  - eval_time: 5m
+    alertname: HWPowerBrakeThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-0
+        nvidia_gpu: "0"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0)
+        description: |
+          HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0.
+          This is an indicator of:
+          - External Power Brake Assertion being triggered (e.g. by the system power supply)
+          LABELS = map[instance:ubuntu-0 nvidia_gpu:0]
+  - eval_time: 5m
+    alertname: HWThermalThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-0
+        nvidia_gpu: "0"
+        severity: warning
+      exp_annotations:
+        summary: GPU Hardware Thermal throttling detected. (instance ubuntu-0)
+        description: |
+          HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0.
+          This is an indicator of:
+          - Temperature being too high
+          LABELS = map[instance:ubuntu-0 nvidia_gpu:0]
+  - eval_time: 5m
+    alertname: SWPowerThrottle
+    exp_alerts:
+    - exp_labels:
+        instance: ubuntu-0
+        nvidia_gpu: "0"
+        severity: warning
+      exp_annotations:
+        summary: GPU Software Power throttling detected. (instance ubuntu-0)
+        description: |
+          SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 0.
+          LABELS = map[instance:ubuntu-0 nvidia_gpu:0]
+  - eval_time: 5m
+    alertname: SWThermalThrottle
+    exp_alerts: []
+  - eval_time: 5m
+    alertname: HWSlowdownThrottle
+    exp_alerts: []