Skip to content

Commit

Permalink
Add separate dcgm alert for each throttle reason
Browse files Browse the repository at this point in the history
  • Loading branch information
Deezzir committed Oct 1, 2024
1 parent 36b9c56 commit 62fd448
Show file tree
Hide file tree
Showing 4 changed files with 308 additions and 39 deletions.
15 changes: 0 additions & 15 deletions src/prometheus_alert_rules/DCGM.yaml

This file was deleted.

65 changes: 65 additions & 0 deletions src/prometheus_alert_rules/dcgm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Alert rules for NVIDIA GPU clock throttling, one alert per throttle reason.
#
# Each rule tests a single bit of DCGM_FI_DEV_CLOCK_THROTTLE_REASONS.
# PromQL has no bitwise operators, so "bit with value N is set" is written
# as `metric % (2 * N) >= N`: the modulo discards every higher bit, and the
# comparison is then true exactly when bit N itself is set.
groups:
- name: NVIDIA DCGM Throttling Alerts
  rules:
  # Bit value 128: hardware power brake slowdown.
  - alert: HWPowerBrakeThrottle
    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "GPU Hardware Power Brake Slowdown throttling detected. (instance {{ $labels.instance }})"
      description: |
        HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
        This is an indicator of:
        - External Power Brake Assertion being triggered (e.g. by the system power supply)
        LABELS = {{ $labels }}
  # Bit value 64: hardware thermal slowdown.
  - alert: HWThermalThrottle
    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "GPU Hardware Thermal throttling detected. (instance {{ $labels.instance }})"
      description: |
        HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
        This is an indicator of:
        - Temperature being too high
        LABELS = {{ $labels }}
  # Bit value 32: software thermal slowdown.
  - alert: SWThermalThrottle
    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "GPU Software Thermal throttling detected. (instance {{ $labels.instance }})"
      description: |
        SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
        This is an indicator of:
        - Current GPU temperature above the GPU Max Operating Temperature
        - Current memory temperature above the Memory Max Operating Temperature
        LABELS = {{ $labels }}
  # Bit value 8: generic hardware slowdown.
  - alert: HWSlowdownThrottle
    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "GPU Hardware Slowdown throttling detected. (instance {{ $labels.instance }})"
      description: |
        HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
        This is an indicator of:
        - Temperature being too high
        - External Power Brake Assertion is triggered (e.g. by the system power supply)
        - Power draw is too high and Fast Trigger protection is reducing the clocks
        - May be also reported during PState or clock change
        LABELS = {{ $labels }}
  # Bit value 4: software power scaling. Uses a longer hold (5m) than the
  # other rules (3m).
  - alert: SWPowerThrottle
    expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 8 >= 4
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "GPU Software Power throttling detected. (instance {{ $labels.instance }})"
      description: |
        SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
        LABELS = {{ $labels }}
24 changes: 0 additions & 24 deletions tests/unit/test_alert_rules/test_DCGM.yaml

This file was deleted.

243 changes: 243 additions & 0 deletions tests/unit/test_alert_rules/test_dcgm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
# Alert rule file exercised by the test cases in this file.
rule_files:
- '../../../src/prometheus_alert_rules/dcgm.yaml'

# Interval at which the rules are evaluated during each simulated run.
evaluation_interval: 1m

# One entry per scenario: a synthetic input series plus the alerts expected.
tests:
# HW Power Brake Throttle active
# The series holds 128 on every sample, i.e. only the bit checked by
# HWPowerBrakeThrottle; the other four alerts must stay silent.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
    values: '128 128 128 128 128'
  alert_rule_test:
  - eval_time: 5m
    alertname: HWPowerBrakeThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-0
        # Label values are strings; quoted so YAML does not read an integer.
        nvidia_gpu: '0'
        severity: warning
      # NOTE(review): the rule also renders a `description` annotation;
      # confirm the test runner accepts a summary-only expectation.
      exp_annotations:
        summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0)
  - eval_time: 5m
    alertname: HWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWSlowdownThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWPowerThrottle
    exp_alerts: []

# HW Thermal Throttle active
# The series holds 64 on every sample, i.e. only the bit checked by
# HWThermalThrottle; the other four alerts must stay silent.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="1"}'
    values: '64 64 64 64 64'
  alert_rule_test:
  - eval_time: 5m
    alertname: HWThermalThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-0
        # Label values are strings; quoted so YAML does not read an integer.
        nvidia_gpu: '1'
        severity: warning
      # NOTE(review): the rule also renders a `description` annotation;
      # confirm the test runner accepts a summary-only expectation.
      exp_annotations:
        summary: GPU Hardware Thermal throttling detected. (instance ubuntu-0)
  - eval_time: 5m
    alertname: HWPowerBrakeThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWSlowdownThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWPowerThrottle
    exp_alerts: []

# SW Thermal Throttle active
# The series holds 32 on every sample, i.e. only the bit checked by
# SWThermalThrottle; the other four alerts must stay silent.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="0"}'
    values: '32 32 32 32 32'
  alert_rule_test:
  - eval_time: 5m
    alertname: SWThermalThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-1
        # Label values are strings; quoted so YAML does not read an integer.
        nvidia_gpu: '0'
        severity: warning
      # NOTE(review): the rule also renders a `description` annotation;
      # confirm the test runner accepts a summary-only expectation.
      exp_annotations:
        summary: GPU Software Thermal throttling detected. (instance ubuntu-1)
  - eval_time: 5m
    alertname: HWPowerBrakeThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWSlowdownThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWPowerThrottle
    exp_alerts: []

# HW Slowdown Throttle active
# The series holds 8 on every sample, i.e. only the bit checked by
# HWSlowdownThrottle; the other four alerts must stay silent.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="1"}'
    values: '8 8 8 8 8'
  alert_rule_test:
  - eval_time: 5m
    alertname: HWSlowdownThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-1
        # Label values are strings; quoted so YAML does not read an integer.
        nvidia_gpu: '1'
        severity: warning
      # NOTE(review): the rule also renders a `description` annotation;
      # confirm the test runner accepts a summary-only expectation.
      exp_annotations:
        summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-1)
  - eval_time: 5m
    alertname: HWPowerBrakeThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWPowerThrottle
    exp_alerts: []

# SW Power Throttle active
# The series holds 4 on every sample, i.e. only the bit checked by
# SWPowerThrottle. That rule uses `for: 5m`, so eval_time 5m is the earliest
# evaluation at which it is expected to fire.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-2", nvidia_gpu="0"}'
    values: '4 4 4 4 4'
  alert_rule_test:
  - eval_time: 5m
    alertname: SWPowerThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-2
        # Label values are strings; quoted so YAML does not read an integer.
        nvidia_gpu: '0'
        severity: warning
      # NOTE(review): the rule also renders a `description` annotation;
      # confirm the test runner accepts a summary-only expectation.
      exp_annotations:
        summary: GPU Software Power throttling detected. (instance ubuntu-2)
  - eval_time: 5m
    alertname: HWPowerBrakeThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: SWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWSlowdownThrottle
    exp_alerts: []

# No throttling
# The series stays at 0, so no throttle bit is set and none of the five
# alerts may fire.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
    values: '0 0 0 0 0'
  alert_rule_test:
  - alertname: HWPowerBrakeThrottle
    eval_time: 5m
    exp_alerts: []
  - alertname: HWThermalThrottle
    eval_time: 5m
    exp_alerts: []
  - alertname: SWThermalThrottle
    eval_time: 5m
    exp_alerts: []
  - alertname: HWSlowdownThrottle
    eval_time: 5m
    exp_alerts: []
  - alertname: SWPowerThrottle
    eval_time: 5m
    exp_alerts: []

# All throttling reasons active
# 255 sets every bit from 1 through 128, which includes all five bits the
# rules check (128, 64, 32, 8, 4), so every alert is expected to fire.
# NOTE(review): no exp_annotations are listed although the rules render
# summary and description; confirm the test runner accepts that.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-3", nvidia_gpu="2"}'
    values: '255 255 255 255 255'
  alert_rule_test:
  - eval_time: 5m
    alertname: HWPowerBrakeThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-3
        # Label values are strings; quoted so YAML does not read an integer.
        nvidia_gpu: '2'
        severity: warning
  - eval_time: 5m
    alertname: HWThermalThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-3
        nvidia_gpu: '2'
        severity: warning
  - eval_time: 5m
    alertname: SWThermalThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-3
        nvidia_gpu: '2'
        severity: warning
  - eval_time: 5m
    alertname: HWSlowdownThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-3
        nvidia_gpu: '2'
        severity: warning
  - eval_time: 5m
    alertname: SWPowerThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-3
        nvidia_gpu: '2'
        severity: warning

# Multiple throttling reasons
# 196 = 128 + 64 + 4: the power-brake, HW-thermal and SW-power bits are set,
# while the SW-thermal (32) and HW-slowdown (8) bits are clear.
# NOTE(review): no exp_annotations are listed although the rules render
# summary and description; confirm the test runner accepts that.
- interval: 1m
  input_series:
  - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
    values: '196 196 196 196 196'  # 128 + 64 + 4
  alert_rule_test:
  - eval_time: 5m
    alertname: HWPowerBrakeThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-0
        # Label values are strings; quoted so YAML does not read an integer.
        nvidia_gpu: '0'
        severity: warning
  - eval_time: 5m
    alertname: HWThermalThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-0
        nvidia_gpu: '0'
        severity: warning
  - eval_time: 5m
    alertname: SWPowerThrottle
    exp_alerts:
    - exp_labels:
        instance: ubuntu-0
        nvidia_gpu: '0'
        severity: warning
  - eval_time: 5m
    alertname: SWThermalThrottle
    exp_alerts: []
  - eval_time: 5m
    alertname: HWSlowdownThrottle
    exp_alerts: []

0 comments on commit 62fd448

Please sign in to comment.