[kube-prometheus-stack] allow override of for and severity rules #4225

Merged
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
@@ -23,7 +23,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 56.8.2
version: 56.9.0
appVersion: v0.71.2
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
7 changes: 7 additions & 0 deletions charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml
@@ -33,3 +33,10 @@ prometheus:
logFormat: json
additionalConfigString: |-
logLevel: {{ print "debug" | quote }}
customRules:
AlertmanagerFailedReload:
for: 3m
AlertmanagerMembersInconsistent:
for: 5m
severity: "warning"
59 changes: 59 additions & 0 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
@@ -406,6 +406,63 @@ def add_custom_keep_firing_for(rules, indent=4):
return rules


def add_custom_for(rules, indent=4):
"""Add custom 'for:' condition in rules"""
replace_field = "for:"
rules = add_custom_alert_rules(rules, replace_field, indent)

return rules


def add_custom_severity(rules, indent=4):
"""Add custom 'severity:' condition in rules"""
replace_field = "severity:"
rules = add_custom_alert_rules(rules, replace_field, indent)

return rules


def add_custom_alert_rules(rules, key_to_replace, indent):
"""Extend alert field to allow custom values"""
key_to_replace_indented = ' ' * indent + key_to_replace
alertkey_field = '- alert:'
found_alert_key = False
alertname = None
updated_rules = ''

# pylint: disable=C0200
i = 0
while i < len(rules):
if rules[i:i + len(alertkey_field)] == alertkey_field:
found_alert_key = True
start_index_word_after = i + len(alertkey_field) + 1
end_index_alertkey_field = start_index_word_after
while end_index_alertkey_field < len(rules) and rules[end_index_alertkey_field].isalnum():
end_index_alertkey_field += 1

alertname = rules[start_index_word_after:end_index_alertkey_field]

if found_alert_key:
if rules[i:i + len(key_to_replace_indented)] == key_to_replace_indented:
found_alert_key = False
start_index_key_value = i + len(key_to_replace_indented) + 1
end_index_key_to_replace = start_index_key_value
while end_index_key_to_replace < len(rules) and rules[end_index_key_to_replace].isalnum():
end_index_key_to_replace += 1

word_after_key_to_replace = rules[start_index_key_value:end_index_key_to_replace]
new_key = key_to_replace_indented + ' {{ dig "' + alertname + \
'" "' + key_to_replace[:-1] + '" "' + \
word_after_key_to_replace + '" .Values.customRules }}'
updated_rules += new_key
i = end_index_key_to_replace

updated_rules += rules[i]
i += 1

return updated_rules


def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes):
fix_expr(group['rules'])
group_name = group['name']
@@ -423,6 +480,8 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
rules = add_custom_labels(rules, group)
rules = add_custom_annotations(rules, group)
rules = add_custom_keep_firing_for(rules)
rules = add_custom_for(rules)
rules = add_custom_severity(rules)
rules = add_rules_conditions_from_condition_map(rules)
rules = add_rules_per_rule_conditions(rules, group)
# initialize header
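For illustration, a minimal sketch of what the two new helpers do to a synced rule fragment, assuming add_custom_for and add_custom_severity from hack/sync_prometheus_rules.py are in scope; the sample fragment below is hand-written, not taken from a real synced file:

# Hand-written fragment shaped like the synced rule text (illustrative only).
sample = (
    "  - alert: AlertmanagerFailedReload\n"
    "    expr: vector(0)\n"
    "    for: 10m\n"
    "    labels:\n"
    "      severity: critical\n"
)

out = add_custom_severity(add_custom_for(sample))
print(out)
# The hard-coded values become dig lookups keyed by the alert name:
#   for: {{ dig "AlertmanagerFailedReload" "for" "10m" .Values.customRules }}
#   severity: {{ dig "AlertmanagerFailedReload" "severity" "critical" .Values.customRules }}
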
@@ -42,12 +42,12 @@ spec:
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
for: 10m
for: {{ dig "AlertmanagerFailedReload" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerFailedReload" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -75,12 +75,12 @@ spec:
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
< on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) group_left
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: 15m
for: {{ dig "AlertmanagerMembersInconsistent" "for" "15m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerMembersInconsistent" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -109,12 +109,12 @@ spec:
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
)
> 0.01
for: 5m
for: {{ dig "AlertmanagerFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
severity: {{ dig "AlertmanagerFailedToSendAlerts" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -143,12 +143,12 @@ spec:
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
)
> 0.01
for: 5m
for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -177,12 +177,12 @@ spec:
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
)
> 0.01
for: 5m
for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -209,12 +209,12 @@ spec:
count_values by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
)
!= 1
for: 20m
for: {{ dig "AlertmanagerConfigInconsistent" "for" "20m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerConfigInconsistent" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -247,12 +247,12 @@ spec:
)
)
>= 0.5
for: 5m
for: {{ dig "AlertmanagerClusterDown" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerClusterDown" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -285,12 +285,12 @@ spec:
)
)
>= 0.5
for: 5m
for: {{ dig "AlertmanagerClusterCrashlooping" "for" "5m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
severity: {{ dig "AlertmanagerClusterCrashlooping" "severity" "critical" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -39,12 +39,12 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/configreloadersidecarerrors
summary: config-reloader sidecar has not had a successful reload for 10m
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
for: {{ dig "ConfigReloaderSidecarErrors" "for" "10m" .Values.customRules }}
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
severity: {{ dig "ConfigReloaderSidecarErrors" "severity" "warning" .Values.customRules }}
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}