Skip to content

Commit

Permalink
Fix expr with some modulo logic
Browse files Browse the repository at this point in the history
  • Loading branch information
Deezzir committed Oct 1, 2024
1 parent 2770f80 commit 8416584
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 49 deletions.
40 changes: 25 additions & 15 deletions src/prometheus_alert_rules/dcgm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,64 +2,74 @@ groups:
- name: NVIDIA DCGM Throttling Alerts
rules:
- alert: HWPowerBrakeThrottle
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS and 128 != bool 0
# Keep only the 8 least significant bits with % 256 (discards bit 8 and above).
# Bit 7 (counting from bit 0) is set exactly when that remainder is >= 128.
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128
for: 3m
labels:
severity: warning
annotations:
summary: GPU Hardware Power Brake Slowdown throttling detected. (instance {{ $labels.instance }})
summary: GPU Hardware Power Brake Slowdown throttling detected. (instance {{ $labels.Hostname }})
description: |
HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }}.
This is an indicator of:
- External Power Brake Assertion being triggered (e.g. by the system power supply)
LABELS = {{ $labels }}
- alert: HWThermalThrottle
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS and 64 != bool 0
# Keep only the 7 least significant bits with % 128 (discards bit 7 and above).
# Bit 6 (counting from bit 0) is set exactly when that remainder is >= 64.
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64
for: 3m
labels:
severity: warning
annotations:
summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.instance }})
summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.Hostname }})
description: |
HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }}.
This is an indicator of:
- Temperature being too high
LABELS = {{ $labels }}
- alert: SWThermalThrottle
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS and 32 != bool 0
# Keep only the 6 least significant bits with % 64 (discards bit 6 and above).
# Bit 5 (counting from bit 0) is set exactly when that remainder is >= 32.
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32
for: 3m
labels:
severity: warning
annotations:
summary: GPU Software Thermal throttling detected. (instance {{ $labels.instance }})
summary: GPU Software Thermal throttling detected. (instance {{ $labels.Hostname }})
description: |
SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.gpu }}.
This is an indicator of:
- Current GPU temperature above the GPU Max Operating Temperature
- Current memory temperature above the Memory Max Operating Temperature
LABELS = {{ $labels }}
- alert: HWSlowdownThrottle
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS and 8 != bool 0
# Keep only the 4 least significant bits with % 16 (discards bit 4 and above).
# Bit 3 (counting from bit 0) is set exactly when that remainder is >= 8.
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8
for: 3m
labels:
severity: warning
annotations:
summary: GPU Hardware Slowdown throttling detected. (instance {{ $labels.instance }})
summary: GPU Hardware Slowdown throttling detected. (instance {{ $labels.Hostname }})
description: |
HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }}.
This is an indicator of:
- Temperature being too high
- External Power Brake Assertion is triggered (e.g. by the system power supply)
- Power draw is too high and Fast Trigger protection is reducing the clocks
- May be also reported during PState or clock change
LABELS = {{ $labels }}
- alert: SWPowerThrottle
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS and 4 != bool 0
# Keep only the 3 least significant bits with % 8 (discards bit 3 and above).
# Bit 2 (counting from bit 0) is set exactly when that remainder is >= 4.
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 8 >= 4
for: 5m
labels:
severity: warning
annotations:
summary: GPU Software Power throttling detected. (instance {{ $labels.instance }})
summary: GPU Software Power throttling detected. (instance {{ $labels.Hostname }})
description: |
SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.nvidia_gpu }}.
SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.gpu }}.
LABELS = {{ $labels }}
68 changes: 34 additions & 34 deletions tests/unit/test_alert_rules/test_dcgm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@ tests:
# HW Power Brake Throttle active
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="0"}'
values: '128 128 128 128 128'
alert_rule_test:
- eval_time: 5m
alertname: HWPowerBrakeThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-0
nvidia_gpu: 0
Hostname: ubuntu-0
gpu: 0
severity: warning
exp_annotations:
summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0)
Expand All @@ -35,15 +35,15 @@ tests:
# HW Thermal Throttle active
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="1"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="1"}'
values: '64 64 64 64 64'
alert_rule_test:
- eval_time: 5m
alertname: HWThermalThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-0
nvidia_gpu: 1
Hostname: ubuntu-0
gpu: 1
severity: warning
exp_annotations:
summary: GPU Hardware Thermal throttling detected. (instance ubuntu-0)
Expand All @@ -63,15 +63,15 @@ tests:
# SW Thermal Throttle active
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="0"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-1", gpu="0"}'
values: '32 32 32 32 32'
alert_rule_test:
- eval_time: 5m
alertname: SWThermalThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-1
nvidia_gpu: 0
Hostname: ubuntu-1
gpu: 0
severity: warning
exp_annotations:
summary: GPU Software Thermal throttling detected. (instance ubuntu-1)
Expand All @@ -91,15 +91,15 @@ tests:
# HW Slowdown Throttle active
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="1"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-1", gpu="1"}'
values: '8 8 8 8 8'
alert_rule_test:
- eval_time: 5m
alertname: HWSlowdownThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-1
nvidia_gpu: 1
Hostname: ubuntu-1
gpu: 1
severity: warning
exp_annotations:
summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-1)
Expand All @@ -119,15 +119,15 @@ tests:
# SW Power Throttle active
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-2", nvidia_gpu="0"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-2", gpu="0"}'
values: '4 4 4 4 4'
alert_rule_test:
- eval_time: 5m
alertname: SWPowerThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-2
nvidia_gpu: 0
Hostname: ubuntu-2
gpu: 0
severity: warning
exp_annotations:
summary: GPU Software Power throttling detected. (instance ubuntu-2)
Expand All @@ -147,7 +147,7 @@ tests:
# No throttling
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="0"}'
values: '0 0 0 0 0'
alert_rule_test:
- eval_time: 5m
Expand All @@ -169,71 +169,71 @@ tests:
# All throttling reasons active
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-3", nvidia_gpu="2"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-3", gpu="2"}'
values: '255 255 255 255 255'
alert_rule_test:
- eval_time: 5m
alertname: HWPowerBrakeThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-3
nvidia_gpu: 2
Hostname: ubuntu-3
gpu: 2
severity: warning
- eval_time: 5m
alertname: HWThermalThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-3
nvidia_gpu: 2
Hostname: ubuntu-3
gpu: 2
severity: warning
- eval_time: 5m
alertname: SWThermalThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-3
nvidia_gpu: 2
Hostname: ubuntu-3
gpu: 2
severity: warning
- eval_time: 5m
alertname: HWSlowdownThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-3
nvidia_gpu: 2
Hostname: ubuntu-3
gpu: 2
severity: warning
- eval_time: 5m
alertname: SWPowerThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-3
nvidia_gpu: 2
Hostname: ubuntu-3
gpu: 2
severity: warning

# Multiple throttling reasons
- interval: 1m
input_series:
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}'
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-0", gpu="0"}'
values: '196 196 196 196 196' # 128 (HW power brake) + 64 (HW thermal) + 4 (SW power)
alert_rule_test:
- eval_time: 5m
alertname: HWPowerBrakeThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-0
nvidia_gpu: 0
Hostname: ubuntu-0
gpu: 0
severity: warning
- eval_time: 5m
alertname: HWThermalThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-0
nvidia_gpu: 0
Hostname: ubuntu-0
gpu: 0
severity: warning
- eval_time: 5m
alertname: SWPowerThrottle
exp_alerts:
- exp_labels:
instance: ubuntu-0
nvidia_gpu: 0
Hostname: ubuntu-0
gpu: 0
severity: warning
- eval_time: 5m
alertname: SWThermalThrottle
Expand Down

0 comments on commit 8416584

Please sign in to comment.