diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index d4776cf7c22..94b47aa2d26 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -1155,18 +1155,36 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing expr: | max by(cluster, namespace, instance) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 - for: 5m + for: 10m labels: severity: warning + - alert: MimirBlockBuilderNoCycleProcessing + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not processed cycles in the past hour. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing + expr: | + max by(cluster, namespace, instance) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 + for: 20m + labels: + severity: critical - alert: MimirBlockBuilderLagging annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%. + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging expr: | max by(cluster, namespace, instance) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 for: 75m labels: severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, instance) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 140m + labels: + severity: critical - alert: MimirBlockBuilderCompactAndUploadFailed annotations: message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. @@ -1175,7 +1193,7 @@ groups: sum by (cluster, namespace, instance) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 for: 5m labels: - severity: warning + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 275dd27111d..78873bccec6 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1169,18 +1169,36 @@ groups: runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing expr: | max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 - for: 5m + for: 10m labels: severity: warning + - alert: MimirBlockBuilderNoCycleProcessing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not processed cycles in the past hour. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing + expr: | + max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 + for: 20m + labels: + severity: critical - alert: MimirBlockBuilderLagging annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%. + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging expr: | max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 for: 75m labels: severity: warning + - alert: MimirBlockBuilderLagging + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging + expr: | + max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + for: 140m + labels: + severity: critical - alert: MimirBlockBuilderCompactAndUploadFailed annotations: message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks. @@ -1189,7 +1207,7 @@ groups: sum by (cluster, namespace, pod) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 for: 5m labels: - severity: warning + severity: critical - name: mimir_continuous_test rules: - alert: MimirContinuousTestNotRunningOnWrites diff --git a/operations/mimir-mixin/alerts/ingest-storage.libsonnet b/operations/mimir-mixin/alerts/ingest-storage.libsonnet index b223a0514b7..e920f6fcf50 100644 --- a/operations/mimir-mixin/alerts/ingest-storage.libsonnet +++ b/operations/mimir-mixin/alerts/ingest-storage.libsonnet @@ -230,7 +230,7 @@ // Alert if block-builder didn't process cycles in the past hour. { alert: $.alertName('BlockBuilderNoCycleProcessing'), - 'for': '5m', + 'for': '10m', expr: ||| max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 ||| % $._config, @@ -241,6 +241,19 @@ message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not processed cycles in the past hour.' % $._config, }, }, + { + alert: $.alertName('BlockBuilderNoCycleProcessing'), + 'for': '20m', + expr: ||| + max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not processed cycles in the past hour.' % $._config, + }, + }, // Alert if block-builder per partition lag is higher than the threshhold. // The value of the threshhold is arbitary large for now. We will reconsider this alert after we get the block-builder-scheduler. @@ -255,7 +268,20 @@ severity: 'warning', }, annotations: { - message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}%%.' % $._config, + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}.' % $._config, + }, + }, + { + alert: $.alertName('BlockBuilderLagging'), + 'for': '140m', // 2h20m. Indicating the lag did not come down for ~2 consumption cycles. + expr: ||| + max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}.' % $._config, }, }, @@ -267,7 +293,7 @@ sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0 ||| % $._config, labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to compact and upload blocks.' % $._config,