diff --git a/src/prometheus_alert_rules/dcgm.yaml b/src/prometheus_alert_rules/dcgm.yaml index 229d0e48..2a498f70 100644 --- a/src/prometheus_alert_rules/dcgm.yaml +++ b/src/prometheus_alert_rules/dcgm.yaml @@ -5,50 +5,68 @@ groups: # isolate the least significant 8 bits with % 256 # check whether bit 7 (starts from bit 0) has been set with the >= 128 comparison expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 256 >= 128 - for: 3m + for: 5m labels: severity: warning annotations: summary: GPU Hardware Power Brake Slowdown throttling detected. (instance {{ $labels.Hostname }}) description: | - HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }}. + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} This is an indicator of: - External Power Brake Assertion being triggered (e.g. by the system power supply) - LABELS = {{ $labels }} + Throttle reasons (bitmask): {{ $value }} + LABELS = {{ $labels }} - alert: HWThermalThrottle # isolate the least significant 7 bits with % 128 # check whether bit 6 (starts from bit 0) has been set with the >= 64 comparison expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 128 >= 64 - for: 3m + for: 5m labels: severity: warning annotations: summary: GPU Hardware Thermal throttling detected. (instance {{ $labels.Hostname }}) description: | - HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }}. + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: {{ $labels.gpu }} This is an indicator of: - Temperature being too high - LABELS = {{ $labels }} + Throttle reasons (bitmask): {{ $value }} + LABELS = {{ $labels }} - alert: SWThermalThrottle # isolate the least significant 6 bits with % 64 # check whether bit 5 (starts from bit 0) has been set with the >= 32 comparison expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 64 >= 32 - for: 3m + for: 5m labels: severity: warning annotations: summary: GPU Software Thermal throttling detected. (instance {{ $labels.Hostname }}) description: | - SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.gpu }}. + SW Thermal Slowdown is engaged on NVIDIA GPU: {{ $labels.gpu }} This is an indicator of: - Current GPU temperature above the GPU Max Operating Temperature - Current memory temperature above the Memory Max Operating Temperature - LABELS = {{ $labels }} + Throttle reasons (bitmask): {{ $value }} + LABELS = {{ $labels }} + - alert: SyncBoostThrottle + # isolate the least significant 5 bits with % 32 + # check whether bit 4 (starts from bit 0) has been set with the >= 16 comparison + expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 32 >= 16 + for: 5m + labels: + severity: warning + annotations: + summary: GPU Sync Boost throttling detected. (instance {{ $labels.Hostname }}) + description: | + This NVIDIA GPU: {{ $labels.gpu }} has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. + All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. + Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. + Throttle reasons (bitmask): {{ $value }} + LABELS = {{ $labels }} - alert: HWSlowdownThrottle # isolate the least significant 4 bits with % 16 # check whether bit 3 (starts from bit 0) has been set with the >= 8 comparison expr: DCGM_FI_DEV_CLOCK_THROTTLE_REASONS % 16 >= 8 - for: 3m + for: 5m labels: severity: warning annotations: @@ -60,7 +78,8 @@ groups: - External Power Brake Assertion is triggered (e.g. by the system power supply) - Power draw is too high and Fast Trigger protection is reducing the clocks - May be also reported during PState or clock change - LABELS = {{ $labels }} + Throttle reasons (bitmask): {{ $value }} + LABELS = {{ $labels }} - alert: SWPowerThrottle # isolate the least significant 3 bits with % 8 # check whether bit 2 (starts from bit 0) has been set with the >= 4 comparison @@ -71,5 +90,6 @@ groups: annotations: summary: GPU Software Power throttling detected. (instance {{ $labels.Hostname }}) description: | - SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.gpu }}. - LABELS = {{ $labels }} + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: {{ $labels.gpu }} + Throttle reasons (bitmask): {{ $value }} + LABELS = {{ $labels }} diff --git a/tests/unit/test_alert_rules/test_dcgm.yaml b/tests/unit/test_alert_rules/test_dcgm.yaml index 0d3311f3..541cbe9c 100644 --- a/tests/unit/test_alert_rules/test_dcgm.yaml +++ b/tests/unit/test_alert_rules/test_dcgm.yaml @@ -19,12 +19,21 @@ tests: severity: warning exp_annotations: summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0) + description: | + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - External Power Brake Assertion being triggered (e.g. by the system power supply) + Throttle reasons (bitmask): 128 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:0 Hostname:ubuntu-0] - eval_time: 5m alertname: HWThermalThrottle exp_alerts: [] - eval_time: 5m alertname: SWThermalThrottle exp_alerts: [] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: [] - eval_time: 5m alertname: HWSlowdownThrottle exp_alerts: [] @@ -47,12 +56,21 @@ tests: severity: warning exp_annotations: summary: GPU Hardware Thermal throttling detected. (instance ubuntu-0) + description: | + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 1 + This is an indicator of: + - Temperature being too high + Throttle reasons (bitmask): 64 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:1 Hostname:ubuntu-0] - eval_time: 5m alertname: HWPowerBrakeThrottle exp_alerts: [] - eval_time: 5m alertname: SWThermalThrottle exp_alerts: [] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: [] - eval_time: 5m alertname: HWSlowdownThrottle exp_alerts: [] @@ -75,12 +93,22 @@ tests: severity: warning exp_annotations: summary: GPU Software Thermal throttling detected. (instance ubuntu-1) + description: | + SW Thermal Slowdown is engaged on NVIDIA GPU: 0 + This is an indicator of: + - Current GPU temperature above the GPU Max Operating Temperature + - Current memory temperature above the Memory Max Operating Temperature + Throttle reasons (bitmask): 32 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:0 Hostname:ubuntu-1] - eval_time: 5m alertname: HWPowerBrakeThrottle exp_alerts: [] - eval_time: 5m alertname: HWThermalThrottle exp_alerts: [] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: [] - eval_time: 5m alertname: HWSlowdownThrottle exp_alerts: [] @@ -88,21 +116,27 @@ tests: alertname: SWPowerThrottle exp_alerts: [] -# HW Slowdown Throttle active +# Sync Boost Throttle active - interval: 1m input_series: - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-1", gpu="1"}' - values: '8 8 8 8 8' + values: '16 16 16 16 16' alert_rule_test: - eval_time: 5m - alertname: HWSlowdownThrottle + alertname: SyncBoostThrottle exp_alerts: - exp_labels: Hostname: ubuntu-1 gpu: 1 severity: warning exp_annotations: - summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-1) + summary: GPU Sync Boost throttling detected. (instance ubuntu-1) + description: | + This NVIDIA GPU: 1 has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. + All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. + Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. + Throttle reasons (bitmask): 16 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:1 Hostname:ubuntu-1] - eval_time: 5m alertname: HWPowerBrakeThrottle exp_alerts: [] @@ -112,14 +146,57 @@ tests: - eval_time: 5m alertname: SWThermalThrottle exp_alerts: [] + - eval_time: 5m + alertname: HWSlowdownThrottle + exp_alerts: [] - eval_time: 5m alertname: SWPowerThrottle exp_alerts: [] -# SW Power Throttle active +# HW Slowdown Throttle active - interval: 1m input_series: - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-2", gpu="0"}' + values: '8 8 8 8 8' + alert_rule_test: + - eval_time: 5m + alertname: HWSlowdownThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-2 + gpu: 0 + severity: warning + exp_annotations: + summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-2) + description: | + HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - Temperature being too high + - External Power Brake Assertion is triggered (e.g. by the system power supply) + - Power draw is too high and Fast Trigger protection is reducing the clocks + - May be also reported during PState or clock change + Throttle reasons (bitmask): 8 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:0 Hostname:ubuntu-2] + - eval_time: 5m + alertname: HWPowerBrakeThrottle + exp_alerts: [] + - eval_time: 5m + alertname: HWThermalThrottle + exp_alerts: [] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: [] + - eval_time: 5m + alertname: SWThermalThrottle + exp_alerts: [] + - eval_time: 5m + alertname: SWPowerThrottle + exp_alerts: [] + +# SW Power Throttle active +- interval: 1m + input_series: + - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-2", gpu="1"}' values: '4 4 4 4 4' alert_rule_test: - eval_time: 5m @@ -127,16 +204,23 @@ tests: exp_alerts: - exp_labels: Hostname: ubuntu-2 - gpu: 0 + gpu: 1 severity: warning exp_annotations: summary: GPU Software Power throttling detected. (instance ubuntu-2) + description: | + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 1 + Throttle reasons (bitmask): 4 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:1 Hostname:ubuntu-2] - eval_time: 5m alertname: HWPowerBrakeThrottle exp_alerts: [] - eval_time: 5m alertname: HWThermalThrottle exp_alerts: [] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: [] - eval_time: 5m alertname: SWThermalThrottle exp_alerts: [] @@ -159,6 +243,9 @@ tests: - eval_time: 5m alertname: SWThermalThrottle exp_alerts: [] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: [] - eval_time: 5m alertname: HWSlowdownThrottle exp_alerts: [] @@ -170,7 +257,7 @@ tests: - interval: 1m input_series: - series: 'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS{Hostname="ubuntu-3", gpu="2"}' - values: '255 255 255 255 255' + values: '511 511 511 511 511' alert_rule_test: - eval_time: 5m alertname: HWPowerBrakeThrottle @@ -179,6 +266,14 @@ tests: Hostname: ubuntu-3 gpu: 2 severity: warning + exp_annotations: + summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-3) + description: | + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2 + This is an indicator of: + - External Power Brake Assertion being triggered (e.g. by the system power supply) + Throttle reasons (bitmask): 511 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:2 Hostname:ubuntu-3] - eval_time: 5m alertname: HWThermalThrottle exp_alerts: @@ -186,6 +281,14 @@ tests: Hostname: ubuntu-3 gpu: 2 severity: warning + exp_annotations: + summary: GPU Hardware Thermal throttling detected. (instance ubuntu-3) + description: | + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2 + This is an indicator of: + - Temperature being too high + Throttle reasons (bitmask): 511 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:2 Hostname:ubuntu-3] - eval_time: 5m alertname: SWThermalThrottle exp_alerts: @@ -193,6 +296,30 @@ tests: Hostname: ubuntu-3 gpu: 2 severity: warning + exp_annotations: + summary: GPU Software Thermal throttling detected. (instance ubuntu-3) + description: | + SW Thermal Slowdown is engaged on NVIDIA GPU: 2 + This is an indicator of: + - Current GPU temperature above the GPU Max Operating Temperature + - Current memory temperature above the Memory Max Operating Temperature + Throttle reasons (bitmask): 511 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:2 Hostname:ubuntu-3] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: + - exp_labels: + Hostname: ubuntu-3 + gpu: 2 + severity: warning + exp_annotations: + summary: GPU Sync Boost throttling detected. (instance ubuntu-3) + description: | + This NVIDIA GPU: 2 has been added to a Sync boost group with nvidia-smi or DCGM in order to maximize performance per watt. + All GPUs in the sync boost group will boost to the minimum possible clocks across the entire group. + Look at the throttle reasons for other GPUs in the system to see why those GPUs are holding this one at lower clocks. + Throttle reasons (bitmask): 511 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:2 Hostname:ubuntu-3] - eval_time: 5m alertname: HWSlowdownThrottle exp_alerts: @@ -200,6 +327,17 @@ tests: Hostname: ubuntu-3 gpu: 2 severity: warning + exp_annotations: + summary: GPU Hardware Slowdown throttling detected. (instance ubuntu-3) + description: | + HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 2 + This is an indicator of: + - Temperature being too high + - External Power Brake Assertion is triggered (e.g. by the system power supply) + - Power draw is too high and Fast Trigger protection is reducing the clocks + - May be also reported during PState or clock change + Throttle reasons (bitmask): 511 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:2 Hostname:ubuntu-3] - eval_time: 5m alertname: SWPowerThrottle exp_alerts: @@ -207,6 +345,12 @@ tests: Hostname: ubuntu-3 gpu: 2 severity: warning + exp_annotations: + summary: GPU Software Power throttling detected. (instance ubuntu-3) + description: | + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 2 + Throttle reasons (bitmask): 511 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:2 Hostname:ubuntu-3] # Multiple throttling reasons - interval: 1m @@ -221,6 +365,14 @@ tests: Hostname: ubuntu-0 gpu: 0 severity: warning + exp_annotations: + summary: GPU Hardware Power Brake Slowdown throttling detected. (instance ubuntu-0) + description: | + HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - External Power Brake Assertion being triggered (e.g. by the system power supply) + Throttle reasons (bitmask): 196 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:0 Hostname:ubuntu-0] - eval_time: 5m alertname: HWThermalThrottle exp_alerts: @@ -228,6 +380,14 @@ tests: Hostname: ubuntu-0 gpu: 0 severity: warning + exp_annotations: + summary: GPU Hardware Thermal throttling detected. (instance ubuntu-0) + description: | + HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged on NVIDIA GPU: 0 + This is an indicator of: + - Temperature being too high + Throttle reasons (bitmask): 196 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:0 Hostname:ubuntu-0] - eval_time: 5m alertname: SWPowerThrottle exp_alerts: @@ -235,6 +395,15 @@ tests: Hostname: ubuntu-0 gpu: 0 severity: warning + exp_annotations: + summary: GPU Software Power throttling detected. (instance ubuntu-0) + description: | + SW Power Scaling algorithm is reducing the clocks below requested clocks on NVIDIA GPU: 0 + Throttle reasons (bitmask): 196 + LABELS = map[__name__:DCGM_FI_DEV_CLOCK_THROTTLE_REASONS gpu:0 Hostname:ubuntu-0] + - eval_time: 5m + alertname: SyncBoostThrottle + exp_alerts: [] - eval_time: 5m alertname: SWThermalThrottle exp_alerts: []