-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add separate dcgm alert for each throttle reason
- Loading branch information
Showing
4 changed files
with
308 additions
and
39 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
groups: | ||
- name: NVIDIA DCGM Throttling Alerts | ||
rules: | ||
- alert: HWPowerBrakeThrottle | ||
# PromQL has no bitwise AND operator, so the bit is tested arithmetically: | ||
# `x % 256 >= 128` holds exactly when bit 7 (value 128, HW power brake) of the throttle-reasons bitmask is set. | ||
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: GPU Hardware Power Brake Slowdown throttling detected. (instance {{ $labels.instance }}) | ||
description: | | ||
HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}. | ||
This is an indicator of: | ||
- External Power Brake Assertion being triggered (e.g. by the system power supply) | ||
LABELS = {{ $labels }} | ||
- alert: HWThermalThrottle | ||
# PromQL has no bitwise AND operator, so the bit is tested arithmetically: | ||
# `x % 128 >= 64` holds exactly when bit 6 (value 64, HW thermal slowdown) of the throttle-reasons bitmask is set. | ||
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.instance }}) | ||
description: | | ||
HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}. | ||
This is an indicator of: | ||
- Temperature being too high | ||
LABELS = {{ $labels }} | ||
- alert: SWThermalThrottle | ||
# PromQL has no bitwise AND operator, so the bit is tested arithmetically: | ||
# `x % 64 >= 32` holds exactly when bit 5 (value 32, SW thermal slowdown) of the throttle-reasons bitmask is set. | ||
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: GPU Software Thermal throttling detected. (instance {{ $labels.instance }}) | ||
description: | | ||
SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}. | ||
This is an indicator of: | ||
- Current GPU temperature above the GPU Max Operating Temperature | ||
- Current memory temperature above the Memory Max Operating Temperature | ||
LABELS = {{ $labels }} | ||
- alert: HWSlowdownThrottle | ||
# PromQL has no bitwise AND operator, so the bit is tested arithmetically: | ||
# `x % 16 >= 8` holds exactly when bit 3 (value 8, HW slowdown) of the throttle-reasons bitmask is set. | ||
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: GPU Hardware Slowdown throttling detected. (instance {{ $labels.instance }}) | ||
description: | | ||
HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.nvidia_gpu }}. | ||
This is an indicator of: | ||
- Temperature being too high | ||
- External Power Brake Assertion is triggered (e.g. by the system power supply) | ||
- Power draw is too high and Fast Trigger protection is reducing the clocks | ||
- May be also reported during PState or clock change | ||
LABELS = {{ $labels }} | ||
- alert: SWPowerThrottle | ||
# PromQL has no bitwise AND operator, so the bit is tested arithmetically: | ||
# `x % 8 >= 4` holds exactly when bit 2 (value 4, SW power cap) of the throttle-reasons bitmask is set. | ||
expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 8 >= 4 | ||
for: 5m | ||
labels: | ||
severity: warning | ||
annotations: | ||
summary: GPU Software Power throttling detected. (instance {{ $labels.instance }}) | ||
description: | | ||
SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.nvidia_gpu }}. | ||
LABELS = {{ $labels }} |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,243 @@ | ||
rule_files: | ||
- ../../../src/prometheus_alert_rules/dcgm.yaml | ||
|
||
evaluation_interval: 1m | ||
|
||
tests: | ||
# HW Power Brake Throttle active | ||
# Every sample is 128 (only the power-brake bit), so at 5m HWPowerBrakeThrottle is the single expected alert | ||
# and the other four throttle alerts are asserted to be empty. | ||
# NOTE(review): the rule also defines a `description` annotation that is not listed under exp_annotations; | ||
# promtool appears to compare annotations exactly — confirm this expectation passes. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}' | ||
values: '128 128 128 128 128' | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-0 | ||
nvidia_gpu: 0 | ||
severity: warning | ||
exp_annotations: | ||
summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0) | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: [] | ||
|
||
# HW Thermal Throttle active | ||
# Every sample is 64 (only the HW-thermal bit), so at 5m HWThermalThrottle is the single expected alert | ||
# and the other four throttle alerts are asserted to be empty. | ||
# NOTE(review): the rule also defines a `description` annotation that is not listed under exp_annotations; | ||
# promtool appears to compare annotations exactly — confirm this expectation passes. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="1"}' | ||
values: '64 64 64 64 64' | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-0 | ||
nvidia_gpu: 1 | ||
severity: warning | ||
exp_annotations: | ||
summary: GPU Hardware Thermal throttling detected. (instance ubuntu-0) | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: [] | ||
|
||
# SW Thermal Throttle active | ||
# Every sample is 32 (only the SW-thermal bit), so at 5m SWThermalThrottle is the single expected alert | ||
# and the other four throttle alerts are asserted to be empty. | ||
# NOTE(review): the rule also defines a `description` annotation that is not listed under exp_annotations; | ||
# promtool appears to compare annotations exactly — confirm this expectation passes. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="0"}' | ||
values: '32 32 32 32 32' | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-1 | ||
nvidia_gpu: 0 | ||
severity: warning | ||
exp_annotations: | ||
summary: GPU Software Thermal throttling detected. (instance ubuntu-1) | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: [] | ||
|
||
# HW Slowdown Throttle active | ||
# Every sample is 8 (only the HW-slowdown bit), so at 5m HWSlowdownThrottle is the single expected alert | ||
# and the other four throttle alerts are asserted to be empty. | ||
# NOTE(review): the rule also defines a `description` annotation that is not listed under exp_annotations; | ||
# promtool appears to compare annotations exactly — confirm this expectation passes. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-1", nvidia_gpu="1"}' | ||
values: '8 8 8 8 8' | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-1 | ||
nvidia_gpu: 1 | ||
severity: warning | ||
exp_annotations: | ||
summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-1) | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: [] | ||
|
||
# SW Power Throttle active | ||
# Every sample is 4 (only the SW-power bit), so at 5m SWPowerThrottle is the single expected alert | ||
# and the other four throttle alerts are asserted to be empty. | ||
# NOTE(review): SWPowerThrottle uses `for: 5m` while this test evaluates at exactly 5m with only five samples, | ||
# so the assertion sits on the firing boundary — confirm it is not timing-fragile. | ||
# NOTE(review): the rule also defines a `description` annotation that is not listed under exp_annotations; | ||
# promtool appears to compare annotations exactly — confirm this expectation passes. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-2", nvidia_gpu="0"}' | ||
values: '4 4 4 4 4' | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-2 | ||
nvidia_gpu: 0 | ||
severity: warning | ||
exp_annotations: | ||
summary: GPU Software Power throttling detected. (instance ubuntu-2) | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: [] | ||
|
||
# No throttling | ||
# Negative case: every sample is 0 (no throttle bit set), so all five throttle alerts are asserted to be empty at 5m. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}' | ||
values: '0 0 0 0 0' | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: [] | ||
|
||
# All throttling reasons active | ||
# Every sample is 255, which has bits 128, 64, 32, 8 and 4 set (plus 16, 2 and 1, which no rule checks), | ||
# so all five throttle alerts are expected at 5m for the same instance/GPU. | ||
# NOTE(review): no exp_annotations are listed here although every rule defines `summary` and `description`; | ||
# promtool appears to compare annotations exactly — confirm these expectations pass. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-3", nvidia_gpu="2"}' | ||
values: '255 255 255 255 255' | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-3 | ||
nvidia_gpu: 2 | ||
severity: warning | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-3 | ||
nvidia_gpu: 2 | ||
severity: warning | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-3 | ||
nvidia_gpu: 2 | ||
severity: warning | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-3 | ||
nvidia_gpu: 2 | ||
severity: warning | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-3 | ||
nvidia_gpu: 2 | ||
severity: warning | ||
|
||
# Multiple throttling reasons | ||
# Every sample is 196 = 128 + 64 + 4, so HWPowerBrakeThrottle, HWThermalThrottle and SWPowerThrottle are expected at 5m, | ||
# while SWThermalThrottle (32) and HWSlowdownThrottle (8) are asserted to be empty. | ||
# NOTE(review): no exp_annotations are listed for the firing alerts although every rule defines `summary` and `description`; | ||
# promtool appears to compare annotations exactly — confirm these expectations pass. | ||
- interval: 1m | ||
input_series: | ||
- series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{instance="ubuntu-0", nvidia_gpu="0"}' | ||
values: '196 196 196 196 196' # 128 + 64 + 4 | ||
alert_rule_test: | ||
- eval_time: 5m | ||
alertname: HWPowerBrakeThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-0 | ||
nvidia_gpu: 0 | ||
severity: warning | ||
- eval_time: 5m | ||
alertname: HWThermalThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-0 | ||
nvidia_gpu: 0 | ||
severity: warning | ||
- eval_time: 5m | ||
alertname: SWPowerThrottle | ||
exp_alerts: | ||
- exp_labels: | ||
instance: ubuntu-0 | ||
nvidia_gpu: 0 | ||
severity: warning | ||
- eval_time: 5m | ||
alertname: SWThermalThrottle | ||
exp_alerts: [] | ||
- eval_time: 5m | ||
alertname: HWSlowdownThrottle | ||
exp_alerts: [] |