Skip to content

Commit

Permalink
[kube-prometheus-stack] allow override of for and severity rules (#4225)
Browse files Browse the repository at this point in the history
  • Loading branch information
ps-xaf authored Feb 22, 2024
1 parent 71bba71 commit 33697be
Show file tree
Hide file tree
Showing 23 changed files with 361 additions and 286 deletions.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 56.8.2
version: 56.9.0
appVersion: v0.71.2
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down
7 changes: 7 additions & 0 deletions charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,10 @@ prometheus:
logFormat: json
additionalConfigString: |-
logLevel: {{ print "debug" | quote }}
customRules:
AlertmanagerFailedReload:
for: 3m
AlertmanagerMembersInconsistent:
for: 5m
severity: "warning"
59 changes: 59 additions & 0 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,63 @@ def add_custom_keep_firing_for(rules, indent=4):
return rules


def add_custom_for(rules, indent=4):
    """Make the 'for:' field of alert rules overridable.

    Thin wrapper that hands the rendered rules text to
    ``add_custom_alert_rules`` with ``"for:"`` as the field to rewrite
    and returns whatever that helper produces.
    """
    return add_custom_alert_rules(rules, "for:", indent)


def add_custom_severity(rules, indent=4):
    """Make the 'severity:' field of alert rules overridable.

    Thin wrapper that hands the rendered rules text to
    ``add_custom_alert_rules`` with ``"severity:"`` as the field to
    rewrite and returns whatever that helper produces.
    """
    return add_custom_alert_rules(rules, "severity:", indent)


def add_custom_alert_rules(rules, key_to_replace, indent):
    """Rewrite one field of each alert so its value can be overridden.

    Scans ``rules`` (the rendered rules text) character by character.
    Every time a ``- alert:`` marker is seen, the alphanumeric word that
    follows it is remembered as the current alert name. The first
    occurrence of ``key_to_replace`` (preceded by ``indent`` spaces) after
    that marker has its alphanumeric value replaced with::

        {{ dig "<alertname>" "<key without colon>" "<value>" .Values.customRules }}

    so the original value becomes the default of a Helm ``dig`` lookup.
    Only one replacement is made per ``- alert:`` marker.

    Args:
        rules: Rules text to process.
        key_to_replace: Field name including the trailing colon,
            e.g. ``"for:"`` or ``"severity:"``.
        indent: Number of spaces that must precede ``key_to_replace``.

    Returns:
        The rules text with the matching fields rewritten. Text that
        contains no ``- alert:`` marker is returned unchanged.
    """
    key_to_replace_indented = ' ' * indent + key_to_replace
    alertkey_field = '- alert:'
    total = len(rules)
    found_alert_key = False
    alertname = None
    # Collect output fragments and join once instead of growing a string.
    pieces = []

    i = 0
    while i < total:
        if rules.startswith(alertkey_field, i):
            found_alert_key = True
            # +1 skips the single space expected after '- alert:'.
            name_start = i + len(alertkey_field) + 1
            alertname = rules[name_start:_alnum_run_end(rules, name_start)]

        if found_alert_key and rules.startswith(key_to_replace_indented, i):
            # Only the first matching field after an alert marker is rewritten.
            found_alert_key = False
            # +1 skips the single space expected after the key's colon.
            value_start = i + len(key_to_replace_indented) + 1
            value_end = _alnum_run_end(rules, value_start)
            default_value = rules[value_start:value_end]
            pieces.append(
                key_to_replace_indented + ' {{ dig "' + alertname +
                '" "' + key_to_replace[:-1] + '" "' +
                default_value + '" .Values.customRules }}'
            )
            i = value_end
            if i >= total:
                # The replaced field was the last thing in the input;
                # there is no following character left to copy.
                break

        pieces.append(rules[i])
        i += 1

    return ''.join(pieces)


def _alnum_run_end(text, start):
    """Return the index just past the alphanumeric run beginning at start."""
    end = start
    while end < len(text) and text[end].isalnum():
        end += 1
    return end


def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes):
fix_expr(group['rules'])
group_name = group['name']
Expand All @@ -423,6 +480,8 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
rules = add_custom_labels(rules, group)
rules = add_custom_annotations(rules, group)
rules = add_custom_keep_firing_for(rules)
rules = add_custom_for(rules)
rules = add_custom_severity(rules)
rules = add_rules_conditions_from_condition_map(rules)
rules = add_rules_per_rule_conditions(rules, group)
# initialize header
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ spec:
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
for: 10m
for: {{ dig "AlertmanagerFailedReload" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerFailedReload" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -75,12 +75,12 @@ spec:
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
< on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) group_left
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: 15m
for: {{ dig "AlertmanagerMembersInconsistent" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerMembersInconsistent" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -109,12 +109,12 @@ spec:
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
)
> 0.01
for: 5m
for: {{ dig "AlertmanagerFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
severity: {{ dig "AlertmanagerFailedToSendAlerts" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -143,12 +143,12 @@ spec:
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
)
> 0.01
for: 5m
for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -177,12 +177,12 @@ spec:
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
)
> 0.01
for: 5m
for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand All @@ -209,12 +209,12 @@ spec:
count_values by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
)
!= 1
for: 20m
for: {{ dig "AlertmanagerConfigInconsistent" "for" "20m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerConfigInconsistent" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -247,12 +247,12 @@ spec:
)
)
>= 0.5
for: 5m
for: {{ dig "AlertmanagerClusterDown" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerClusterDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down Expand Up @@ -285,12 +285,12 @@ spec:
)
)
>= 0.5
for: 5m
for: {{ dig "AlertmanagerClusterCrashlooping" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerClusterCrashlooping" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/configreloadersidecarerrors
summary: config-reloader sidecar has not had a successful reload for 10m
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
for: {{ dig "ConfigReloaderSidecarErrors" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
severity: {{ dig "ConfigReloaderSidecarErrors" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
Expand Down
Loading

0 comments on commit 33697be

Please sign in to comment.