Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[kube-prometheus-stack] Allow to configure keep_firing_for for all Prometheus rules #3890

Merged
merged 2 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 51.6.1
version: 51.7.0
appVersion: v0.68.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down
23 changes: 23 additions & 0 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,28 @@ def add_custom_annotations(rules, group, indent=4):

return rules


def add_custom_keep_firing_for(rules, indent=4):
"""Add if wrapper for additional rules annotations"""
indent_spaces = " " * indent + " "
keep_firing_for = (indent_spaces + '{{- with .Values.defaultRules.keepFiringFor }}\n' +
indent_spaces + 'keep_firing_for: "{{ . }}"\n' +
indent_spaces + '{{- end }}')
keep_firing_for_len = len(keep_firing_for) + 1

separator = " " * indent + " for:.*"
alerts_positions = re.finditer(separator, rules)
alert = 0

for alert_position in alerts_positions:
# Add rule_condition after 'annotations:' statement
index = alert_position.end() + keep_firing_for_len * alert
rules = rules[:index] + "\n" + keep_firing_for + rules[index:]
alert += 1

return rules


def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes):
fix_expr(group['rules'])
group_name = group['name']
Expand All @@ -374,6 +396,7 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
# append per-alert rules
rules = add_custom_labels(rules, group)
rules = add_custom_annotations(rules, group)
rules = add_custom_keep_firing_for(rules)
rules = add_rules_conditions_from_condition_map(rules)
rules = add_rules_per_rule_conditions(rules, group)
# initialize header
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ spec:
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -73,6 +76,9 @@ spec:
< on (namespace,service,cluster) group_left
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -104,6 +110,9 @@ spec:
)
> 0.01
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -135,6 +144,9 @@ spec:
)
> 0.01
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -166,6 +178,9 @@ spec:
)
> 0.01
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -195,6 +210,9 @@ spec:
)
!= 1
for: 20m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -230,6 +248,9 @@ spec:
)
>= 0.5
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -265,6 +286,9 @@ spec:
)
>= 0.5
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ spec:
summary: config-reloader sidecar has not had a successful reload for 10m
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ spec:
)
> 0
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -69,6 +72,9 @@ spec:
summary: etcd cluster has insufficient number of members.
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -93,6 +99,9 @@ spec:
summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -117,6 +126,9 @@ spec:
summary: etcd cluster has high number of leader changes.
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand Down Expand Up @@ -145,6 +157,9 @@ spec:
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand Down Expand Up @@ -173,6 +188,9 @@ spec:
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -199,6 +217,9 @@ spec:
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -225,6 +246,9 @@ spec:
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -249,6 +273,9 @@ spec:
summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -275,6 +302,9 @@ spec:
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -301,6 +331,9 @@ spec:
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -327,6 +360,9 @@ spec:
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -351,6 +387,9 @@ spec:
summary: etcd cluster database is running full.
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -375,6 +414,9 @@ spec:
summary: etcd cluster database growing very fast.
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -400,6 +442,9 @@ spec:
summary: etcd database size in use is less than 50% of the actual allocated storage.
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ spec:
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ spec:
and
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 1h
severity: critical
Expand Down Expand Up @@ -71,6 +74,9 @@ spec:
and
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 6h
severity: critical
Expand Down Expand Up @@ -101,6 +107,9 @@ spec:
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 1d
severity: warning
Expand Down Expand Up @@ -131,6 +140,9 @@ spec:
and
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 3d
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ spec:
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand Down Expand Up @@ -71,6 +74,9 @@ spec:
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand All @@ -96,6 +102,9 @@ spec:
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand Down Expand Up @@ -125,6 +134,9 @@ spec:
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
!= 0
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand Down
Loading