From 33697bec952f9325f917c3cfec01cd2e9d2f3d3a Mon Sep 17 00:00:00 2001
From: ps-xaf
Date: Thu, 22 Feb 2024 13:15:53 +0100
Subject: [PATCH] [kube-prometheus-stack] allow override of for and severity
 rules (#4225)

---
 charts/kube-prometheus-stack/Chart.yaml        |  2 +-
 .../ci/03-non-defaults-values.yaml             |  7 ++
 .../hack/sync_prometheus_rules.py              | 59 ++++++++++++
 .../rules-1.14/alertmanager.rules.yaml         | 32 +++----
 .../rules-1.14/config-reloaders.yaml           |  4 +-
 .../templates/prometheus/rules-1.14/etcd.yaml  | 60 ++++++------
 .../prometheus/rules-1.14/general.rules.yaml   |  8 +-
 .../rules-1.14/kube-apiserver-slos.yaml        | 16 ++--
 .../rules-1.14/kube-state-metrics.yaml         | 16 ++--
 .../rules-1.14/kubernetes-apps.yaml            | 62 ++++++------
 .../rules-1.14/kubernetes-resources.yaml       | 32 +++----
 .../rules-1.14/kubernetes-storage.yaml         | 30 +++---
 .../kubernetes-system-apiserver.yaml           | 22 ++---
 .../kubernetes-system-controller-manager.yaml  |  4 +-
 .../kubernetes-system-kube-proxy.yaml          |  6 +-
 .../rules-1.14/kubernetes-system-kubelet.yaml  | 44 ++++-----
 .../kubernetes-system-scheduler.yaml           |  4 +-
 .../rules-1.14/kubernetes-system.yaml          |  8 +-
 .../prometheus/rules-1.14/node-exporter.yaml   | 96 +++++++++----------
 .../prometheus/rules-1.14/node-network.yaml    |  4 +-
 .../rules-1.14/prometheus-operator.yaml        | 32 +++----
 .../prometheus/rules-1.14/prometheus.yaml      | 90 ++++++++---------
 charts/kube-prometheus-stack/values.yaml       |  9 ++
 23 files changed, 361 insertions(+), 286 deletions(-)

diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml
index ae8dbb37ca6f..49b45f92d057 100644
--- a/charts/kube-prometheus-stack/Chart.yaml
+++ b/charts/kube-prometheus-stack/Chart.yaml
@@ -23,7 +23,7 @@ name: kube-prometheus-stack
 sources:
   - https://github.com/prometheus-community/helm-charts
   - https://github.com/prometheus-operator/kube-prometheus
-version: 56.8.2
+version: 56.9.0
 appVersion: v0.71.2
 kubeVersion: ">=1.19.0-0"
 home: https://github.com/prometheus-operator/kube-prometheus
diff --git a/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml b/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml
index c50ff240652c..0838274de5fa 100644
--- a/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml
+++ b/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml
@@ -33,3 +33,10 @@ prometheus:
     logFormat: json
     additionalConfigString: |-
       logLevel: {{ print "debug" | quote }}
+
+customRules:
+  AlertmanagerFailedReload:
+    for: 3m
+  AlertmanagerMembersInconsistent:
+    for: 5m
+    severity: "warning"
diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
index a621b3d6c1ce..41128bf37543 100755
--- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
+++ b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
@@ -406,6 +406,63 @@ def add_custom_keep_firing_for(rules, indent=4):
     return rules
 
 
+def add_custom_for(rules, indent=4):
+    """Add custom 'for:' condition in rules"""
+    replace_field = "for:"
+    rules = add_custom_alert_rules(rules, replace_field, indent)
+
+    return rules
+
+
+def add_custom_severity(rules, indent=4):
+    """Add custom 'severity:' condition in rules"""
+    replace_field = "severity:"
+    rules = add_custom_alert_rules(rules, replace_field, indent)
+
+    return rules
+
+
+def add_custom_alert_rules(rules, key_to_replace, indent):
+    """Extend alert field to allow custom values"""
+    key_to_replace_indented = ' ' * indent + key_to_replace
+    alertkey_field = '- alert:'
+    found_alert_key = False
+    alertname = None
+    updated_rules = ''
+
+    # pylint: disable=C0200
+    i = 0
+    while i < len(rules):
+        if rules[i:i + len(alertkey_field)] == alertkey_field:
+            found_alert_key = True
+            start_index_word_after = i + len(alertkey_field) + 1
+            end_index_alertkey_field = start_index_word_after
+            while end_index_alertkey_field < len(rules) and rules[end_index_alertkey_field].isalnum():
+                end_index_alertkey_field += 1
+
+            alertname = rules[start_index_word_after:end_index_alertkey_field]
+
+        if found_alert_key:
+            if rules[i:i + len(key_to_replace_indented)] == key_to_replace_indented:
+                found_alert_key = False
+                start_index_key_value = i + len(key_to_replace_indented) + 1
+                end_index_key_to_replace = start_index_key_value
+                while end_index_key_to_replace < len(rules) and rules[end_index_key_to_replace].isalnum():
+                    end_index_key_to_replace += 1
+
+                word_after_key_to_replace = rules[start_index_key_value:end_index_key_to_replace]
+                new_key = key_to_replace_indented + ' {{ dig "' + alertname + \
+                    '" "' + key_to_replace[:-1] + '" "' + \
+                    word_after_key_to_replace + '" .Values.customRules }}'
+                updated_rules += new_key
+                i = end_index_key_to_replace
+
+        updated_rules += rules[i]
+        i += 1
+
+    return updated_rules
+
+
 def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes):
     fix_expr(group['rules'])
     group_name = group['name']
@@ -423,6 +480,8 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
     rules = add_custom_labels(rules, group)
     rules = add_custom_annotations(rules, group)
     rules = add_custom_keep_firing_for(rules)
+    rules = add_custom_for(rules)
+    rules = add_custom_severity(rules)
     rules = add_rules_conditions_from_condition_map(rules)
     rules = add_rules_per_rule_conditions(rules, group)
     # initialize header
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml
index c306dc6c18ad..b262424d4aa4 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml
@@ -42,12 +42,12 @@ spec:
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
         max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
-      for: 10m
+      for: {{ dig "AlertmanagerFailedReload" "for" "10m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: critical
+        severity: {{ dig "AlertmanagerFailedReload" "severity" "critical" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
         {{- with .Values.defaultRules.additionalRuleLabels }}
           {{- toYaml . | nindent 8 }}
@@ -75,12 +75,12 @@ spec:
         max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
          < on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) group_left
        count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ .
}},{{ end }}namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) - for: 15m + for: {{ dig "AlertmanagerMembersInconsistent" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerMembersInconsistent" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -109,12 +109,12 @@ spec: ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) ) > 0.01 - for: 5m + for: {{ dig "AlertmanagerFailedToSendAlerts" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "AlertmanagerFailedToSendAlerts" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -143,12 +143,12 @@ spec: ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) ) > 0.01 - for: 5m + for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -177,12 +177,12 @@ spec: ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) ) > 0.01 - for: 5m + for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -209,12 +209,12 @@ spec: count_values by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) ) != 1 - for: 20m + for: {{ dig "AlertmanagerConfigInconsistent" "for" "20m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerConfigInconsistent" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -247,12 +247,12 @@ spec: ) ) >= 0.5 - for: 5m + for: {{ dig "AlertmanagerClusterDown" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerClusterDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -285,12 +285,12 @@ spec: ) ) >= 0.5 - for: 5m + for: {{ dig "AlertmanagerClusterCrashlooping" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerClusterCrashlooping" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml index 3c517306411c..72ebc4cc6dee 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/configreloadersidecarerrors summary: config-reloader sidecar has not had a successful reload for 10m expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0 - for: 10m + for: {{ dig "ConfigReloaderSidecarErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "ConfigReloaderSidecarErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml index a46495f71cbc..b7529604b14b 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml @@ -44,12 +44,12 @@ spec: ) ) > 0 - for: 10m + for: {{ dig "etcdMembersDown" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdMembersDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -71,12 +71,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' summary: etcd cluster has insufficient number of members. 
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) - for: 3m + for: {{ dig "etcdInsufficientMembers" "for" "3m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdInsufficientMembers" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -98,12 +98,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' summary: etcd cluster has no leader. expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - for: 1m + for: {{ dig "etcdNoLeader" "for" "1m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdNoLeader" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -125,12 +125,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' summary: etcd cluster has high number of leader changes. expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 - for: 5m + for: {{ dig "etcdHighNumberOfLeaderChanges" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighNumberOfLeaderChanges" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -156,12 +156,12 @@ spec: / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 1 - for: 10m + for: {{ dig "etcdHighNumberOfFailedGRPCRequests" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighNumberOfFailedGRPCRequests" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -187,12 +187,12 @@ spec: / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 - for: 5m + for: {{ dig "etcdHighNumberOfFailedGRPCRequests" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdHighNumberOfFailedGRPCRequests" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -216,12 +216,12 @@ spec: expr: |- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 - for: 10m + for: {{ dig "etcdGRPCRequestsSlow" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdGRPCRequestsSlow" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -245,12 +245,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 - for: 10m + for: {{ dig "etcdMemberCommunicationSlow" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdMemberCommunicationSlow" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -272,12 +272,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' summary: etcd cluster has high number of proposal failures. expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 - for: 15m + for: {{ dig "etcdHighNumberOfFailedProposals" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighNumberOfFailedProposals" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -301,12 +301,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 - for: 10m + for: {{ dig "etcdHighFsyncDurations" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighFsyncDurations" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -330,12 +330,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 1 - for: 10m + for: {{ dig "etcdHighFsyncDurations" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdHighFsyncDurations" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -359,12 +359,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 - for: 10m + for: {{ dig "etcdHighCommitDurations" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighCommitDurations" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -386,12 +386,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' summary: etcd cluster database is running full. expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 - for: 10m + for: {{ dig "etcdDatabaseQuotaLowSpace" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdDatabaseQuotaLowSpace" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -413,12 +413,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.' summary: etcd cluster database growing very fast. expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"} - for: 10m + for: {{ dig "etcdExcessiveDatabaseGrowth" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdExcessiveDatabaseGrowth" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -441,12 +441,12 @@ spec: runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation summary: etcd database size in use is less than 50% of the actual allocated storage. expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 - for: 10m + for: {{ dig "etcdDatabaseHighFragmentationRatio" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdDatabaseHighFragmentationRatio" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml index a62db1f06bc5..afdb1288dd13 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10 - for: 10m + for: {{ dig "TargetDown" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "TargetDown" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -76,7 +76,7 @@ spec: summary: An alert that should always be firing to certify that Alertmanager is working properly. expr: vector(1) labels: - severity: none + severity: {{ dig "Watchdog" "severity" "none" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -112,7 +112,7 @@ spec: summary: Info-level alert inhibition. expr: ALERTS{severity = "info"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 labels: - severity: none + severity: {{ dig "InfoInhibitor" "severity" "none" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml index 075bb9da1283..3f6a6a2426cc 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml @@ -40,13 +40,13 @@ spec: sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) and sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "2m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: long: 1h - severity: critical + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "critical" .Values.customRules }} short: 5m {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} @@ -73,13 +73,13 @@ spec: sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) and sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: long: 6h - severity: critical + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "critical" .Values.customRules }} short: 30m {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} @@ -106,13 +106,13 @@ spec: sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) and sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: long: 1d - severity: warning + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "warning" .Values.customRules }} short: 2h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} @@ -139,13 +139,13 @@ spec: sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) and sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "3h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: long: 3d - severity: warning + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "warning" .Values.customRules }} short: 6h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml index 7471bd998343..93c6fe9331e0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -42,12 +42,12 @@ spec: / sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0.01 - for: 15m + for: {{ dig "KubeStateMetricsListErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsListErrors" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -74,12 +74,12 @@ spec: / sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0.01 - for: 15m + for: {{ dig "KubeStateMetricsWatchErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsWatchErrors" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -102,12 +102,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch summary: kube-state-metrics sharding is misconfigured. expr: stdvar (kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 - for: 15m + for: {{ dig "KubeStateMetricsShardingMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsShardingMismatch" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -134,12 +134,12 @@ spec: - sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="{{ $kubeStateMetricsJob }}"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 - for: 15m + for: {{ dig "KubeStateMetricsShardsMissing" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsShardsMissing" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml index 48845fe7bc35..8582292a0012 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping summary: Pod is crash looping. expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1 - for: 15m + for: {{ dig "KubePodCrashLooping" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePodCrashLooping" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -74,12 +74,12 @@ spec: 1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) ) ) > 0 - for: 15m + for: {{ dig "KubePodNotReady" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePodNotReady" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -105,12 +105,12 @@ spec: kube_deployment_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != kube_deployment_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - for: 15m + for: {{ dig "KubeDeploymentGenerationMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDeploymentGenerationMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -142,12 +142,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeDeploymentReplicasMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDeploymentReplicasMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -172,12 +172,12 @@ spec: expr: |- kube_deployment_status_condition{condition="Progressing", status="false",job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != 0 - for: 15m + for: {{ dig "KubeDeploymentRolloutStuck" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDeploymentRolloutStuck" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -209,12 +209,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeStatefulSetReplicasMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeStatefulSetReplicasMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -240,12 +240,12 @@ spec: kube_statefulset_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != kube_statefulset_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - for: 15m + for: {{ dig "KubeStatefulSetGenerationMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeStatefulSetGenerationMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -285,12 +285,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeStatefulSetUpdateNotRolledOut" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeStatefulSetUpdateNotRolledOut" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -336,12 +336,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeDaemonSetRolloutStuck" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDaemonSetRolloutStuck" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -364,12 +364,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) > 0 - for: 1h + for: {{ dig "KubeContainerWaiting" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeContainerWaiting" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -395,12 +395,12 @@ spec: kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - kube_daemonset_status_current_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 - for: 10m + for: {{ dig "KubeDaemonSetNotScheduled" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDaemonSetNotScheduled" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -423,12 +423,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. expr: kube_daemonset_status_number_misscheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 - for: 15m + for: {{ dig "KubeDaemonSetMisScheduled" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDaemonSetMisScheduled" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -455,7 +455,7 @@ spec: and kube_job_status_active{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0) > 43200 labels: - severity: warning + severity: {{ dig "KubeJobNotCompleted" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -478,12 +478,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed summary: Job failed to complete. expr: kube_job_failed{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 - for: 15m + for: {{ dig "KubeJobFailed" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeJobFailed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -519,12 +519,12 @@ spec: kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 - for: 15m + for: {{ dig "KubeHpaReplicasMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeHpaReplicasMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -550,12 +550,12 @@ spec: kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} == kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - for: 15m + for: {{ dig "KubeHpaMaxedOut" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeHpaMaxedOut" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml index f0e49dc3b131..3eb2be423f30 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -41,12 +41,12 @@ spec: sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="{{ $kubeStateMetricsJob }}",}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 and (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 - for: 10m + for: {{ dig "KubeCPUOvercommit" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeCPUOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -72,12 +72,12 @@ spec: sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 and (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 - for: 10m + for: {{ dig "KubeMemoryOvercommit" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeMemoryOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -104,12 +104,12 @@ spec: / sum(kube_node_status_allocatable{resource="cpu", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) > 1.5 - for: 5m + for: {{ dig "KubeCPUQuotaOvercommit" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeCPUQuotaOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -136,12 +136,12 @@ spec: / sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) > 1.5 - for: 5m + for: {{ dig "KubeMemoryQuotaOvercommit" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeMemoryQuotaOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -168,12 +168,12 @@ spec: / ignoring(instance, job, type) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) > 0.9 < 1 - for: 15m + for: {{ dig "KubeQuotaAlmostFull" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "KubeQuotaAlmostFull" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -200,12 +200,12 @@ spec: / ignoring(instance, job, type) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) == 1 - for: 15m + for: {{ dig "KubeQuotaFullyUsed" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "KubeQuotaFullyUsed" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -232,12 +232,12 @@ spec: / ignoring(instance, job, type) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) > 1 - for: 15m + for: {{ dig "KubeQuotaExceeded" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeQuotaExceeded" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -264,12 +264,12 @@ spec: / sum(increase(container_cpu_cfs_periods_total{}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, container, pod, namespace) > ( 25 / 100 ) - for: 15m + for: {{ dig "CPUThrottlingHigh" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "CPUThrottlingHigh" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml index 1927c7ad4ea1..dfb99607d20f 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml @@ -35,7 +35,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. + description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: |- @@ -50,12 +50,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m + for: {{ dig "KubePersistentVolumeFillingUp" "for" "1m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubePersistentVolumeFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -74,7 +74,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. + description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: |- @@ -91,12 +91,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h + for: {{ dig "KubePersistentVolumeFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePersistentVolumeFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -115,7 +115,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes. + description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup summary: PersistentVolumeInodes are filling up. expr: |- @@ -130,12 +130,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m + for: {{ dig "KubePersistentVolumeInodesFillingUp" "for" "1m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubePersistentVolumeInodesFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -154,7 +154,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free. + description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup summary: PersistentVolumeInodes are filling up. 
expr: |- @@ -171,12 +171,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h + for: {{ dig "KubePersistentVolumeInodesFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePersistentVolumeInodesFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -195,16 +195,16 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. + description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors summary: PersistentVolume is having issues with provisioning. expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="{{ $kubeStateMetricsJob }}"} > 0 - for: 5m + for: {{ dig "KubePersistentVolumeErrors" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubePersistentVolumeErrors" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml index 30601baa59e5..3e2d9c69fbcc 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - for: 5m + for: {{ dig "KubeClientCertificateExpiration" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeClientCertificateExpiration" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -65,12 +65,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - for: 5m + for: {{ dig "KubeClientCertificateExpiration" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeClientCertificateExpiration" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -94,7 +94,7 @@ spec: summary: Kubernetes aggregated API has reported errors. expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 labels: - severity: warning + severity: {{ dig "KubeAggregatedAPIErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -117,12 +117,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown summary: Kubernetes aggregated API is down. expr: (1 - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 - for: 5m + for: {{ dig "KubeAggregatedAPIDown" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeAggregatedAPIDown" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -146,12 +146,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="apiserver"} == 1) - for: 15m + for: {{ dig "KubeAPIDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeAPIDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -175,12 +175,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 - for: 5m + for: {{ dig "KubeAPITerminatedRequests" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeAPITerminatedRequests" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml index 8c8d94379c83..e24bcac0e67e 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml @@ -38,12 +38,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-controller-manager"} == 1) - for: 15m + for: {{ dig "KubeControllerManagerDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeControllerManagerDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeControllerManager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml index f52f36d4f63b..90fc75caff8c 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeproxydown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-proxy"} == 1) - for: 15m + for: {{ dig "KubeProxyDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeProxyDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeProxy }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -54,4 +54,4 @@ spec: {{- end }} {{- end }} {{- end }} -{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml index 75efdd647171..b71e86607877 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml @@ -38,12 +38,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready summary: Node is not ready. expr: kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",condition="Ready",status="true"} == 0 - for: 15m + for: {{ dig "KubeNodeNotReady" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeNodeNotReady" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -66,12 +66,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable summary: Node is unreachable. expr: (kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m + for: {{ dig "KubeNodeUnreachable" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeNodeUnreachable" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -101,12 +101,12 @@ spec: max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) ( kube_node_status_capacity{job="{{ $kubeStateMetricsJob }}",resource="pods"} != 1 ) > 0.95 - for: 15m + for: {{ dig "KubeletTooManyPods" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "KubeletTooManyPods" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -129,12 +129,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping summary: Node readiness status is flapping. expr: sum(changes(kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",status="true",condition="Ready"}[15m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) > 2 - for: 15m + for: {{ dig "KubeNodeReadinessFlapping" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeNodeReadinessFlapping" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -157,12 +157,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m + for: {{ dig "KubeletPlegDurationHigh" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletPlegDurationHigh" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -185,12 +185,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh summary: Kubelet Pod startup latency is too high. expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le)) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m + for: {{ dig "KubeletPodStartUpLatencyHigh" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletPodStartUpLatencyHigh" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -214,7 +214,7 @@ spec: summary: Kubelet client certificate is about to expire. expr: kubelet_certificate_manager_client_ttl_seconds < 604800 labels: - severity: warning + severity: {{ dig "KubeletClientCertificateExpiration" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -238,7 +238,7 @@ spec: summary: Kubelet client certificate is about to expire. expr: kubelet_certificate_manager_client_ttl_seconds < 86400 labels: - severity: critical + severity: {{ dig "KubeletClientCertificateExpiration" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -262,7 +262,7 @@ spec: summary: Kubelet server certificate is about to expire. 
expr: kubelet_certificate_manager_server_ttl_seconds < 604800 labels: - severity: warning + severity: {{ dig "KubeletServerCertificateExpiration" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -286,7 +286,7 @@ spec: summary: Kubelet server certificate is about to expire. expr: kubelet_certificate_manager_server_ttl_seconds < 86400 labels: - severity: critical + severity: {{ dig "KubeletServerCertificateExpiration" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -309,12 +309,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors summary: Kubelet has failed to renew its client certificate. expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m + for: {{ dig "KubeletClientCertificateRenewalErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletClientCertificateRenewalErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -337,12 +337,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors summary: Kubelet has failed to renew its server certificate. expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m + for: {{ dig "KubeletServerCertificateRenewalErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletServerCertificateRenewalErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -366,12 +366,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m + for: {{ dig "KubeletDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeletDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml index 7ca3c9b9fdb7..4fcae45422df 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml @@ -38,12 +38,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-scheduler"} == 1) - for: 15m + for: {{ dig "KubeSchedulerDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeSchedulerDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml index a32747686e05..362580b72df5 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch summary: Different semantic versions of Kubernetes components running. expr: count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m + for: {{ dig "KubeVersionMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeVersionMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -69,12 +69,12 @@ spec: / sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, job, namespace)) > 0.01 - for: 15m + for: {{ dig "KubeClientErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeClientErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml index 34d7d833cbe2..25b2b68c840a 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml @@ -44,12 +44,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemSpaceFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemSpaceFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -79,12 +79,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemSpaceFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemSpaceFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -112,12 +112,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 30m + for: {{ dig "NodeFilesystemAlmostOutOfSpace" "for" "30m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemAlmostOutOfSpace" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -145,12 +145,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 30m + for: {{ dig "NodeFilesystemAlmostOutOfSpace" "for" "30m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemAlmostOutOfSpace" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -180,12 +180,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemFilesFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemFilesFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -215,12 +215,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemFilesFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemFilesFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -248,12 +248,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemAlmostOutOfFiles" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemAlmostOutOfFiles" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -281,12 +281,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemAlmostOutOfFiles" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemAlmostOutOfFiles" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -309,12 +309,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 - for: 1h + for: {{ dig "NodeNetworkReceiveErrs" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeNetworkReceiveErrs" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -337,12 +337,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 - for: 1h + for: {{ dig "NodeNetworkTransmitErrs" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeNetworkTransmitErrs" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -366,7 +366,7 @@ spec: summary: Number of conntrack are getting close to the limit. expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 labels: - severity: warning + severity: {{ dig "NodeHighNumberConntrackEntriesUsed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -390,7 +390,7 @@ spec: summary: Node Exporter text file collector failed to scrape. expr: node_textfile_scrape_error{job="node-exporter"} == 1 labels: - severity: warning + severity: {{ dig "NodeTextFileCollectorScrapeError" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -424,12 +424,12 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) - for: 10m + for: {{ dig "NodeClockSkewDetected" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeClockSkewDetected" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -455,12 +455,12 @@ spec: min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 - for: 10m + for: {{ dig "NodeClockNotSynchronising" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeClockNotSynchronising" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -483,12 +483,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded summary: RAID Array is degraded. expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 - for: 15m + for: {{ dig "NodeRAIDDegraded" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeRAIDDegraded" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -512,7 +512,7 @@ spec: summary: Failed device in RAID array. 
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 labels: - severity: warning + severity: {{ dig "NodeRAIDDiskFailure" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -538,12 +538,12 @@ spec: ( node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 ) - for: 15m + for: {{ dig "NodeFileDescriptorLimit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFileDescriptorLimit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -569,12 +569,12 @@ spec: ( node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 ) - for: 15m + for: {{ dig "NodeFileDescriptorLimit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFileDescriptorLimit" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -599,12 +599,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodecpuhighusage summary: High CPU usage. expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90 - for: 15m + for: {{ dig "NodeCPUHighUsage" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "NodeCPUHighUsage" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -633,12 +633,12 @@ spec: expr: |- node_load1{job="node-exporter"} / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 - for: 15m + for: {{ dig "NodeSystemSaturation" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeSystemSaturation" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -665,12 +665,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodememorymajorpagesfaults summary: Memory major page faults are occurring at very high rate. expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 - for: 15m + for: {{ dig "NodeMemoryMajorPagesFaults" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeMemoryMajorPagesFaults" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -695,12 +695,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodememoryhighutilization summary: Host is running out of memory. expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m + for: {{ dig "NodeMemoryHighUtilization" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeMemoryHighUtilization" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -719,7 +719,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }} {{- end }} - description: 'Disk IO queue (aqu-sq) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}, has been above 10 for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. + description: 'Disk IO queue (aqu-sq) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}, has been above 10 for the last 30 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. This symptom might indicate disk saturation. @@ -727,12 +727,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodediskiosaturation summary: Disk IO queue is high. expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10 - for: 30m + for: {{ dig "NodeDiskIOSaturation" "for" "30m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeDiskIOSaturation" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -755,12 +755,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodesystemdservicefailed summary: Systemd service has entered failed state. expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: {{ dig "NodeSystemdServiceFailed" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeSystemdServiceFailed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -783,12 +783,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodebondingdegraded summary: Bonding interface is degraded expr: (node_bonding_slaves - node_bonding_active) != 0 - for: 5m + for: {{ dig "NodeBondingDegraded" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeBondingDegraded" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml index bc390506fb59..ecef04f22e20 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping summary: Network interface is often changing its status expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m + for: {{ dig "NodeNetworkInterfaceFlapping" "for" "2m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeNetworkInterfaceFlapping" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.network }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml index a26196df632e..bd7d97c236b1 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorlisterrors summary: Errors while performing list operations in controller. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4 - for: 15m + for: {{ dig "PrometheusOperatorListErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorListErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -67,12 +67,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorwatcherrors summary: Errors while performing watch operations in controller. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4 - for: 15m + for: {{ dig "PrometheusOperatorWatchErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorWatchErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -95,12 +95,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorsyncfailed summary: Last controller reconciliation failed expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m + for: {{ dig "PrometheusOperatorSyncFailed" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorSyncFailed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -123,12 +123,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorreconcileerrors summary: Errors while reconciling objects. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1 - for: 10m + for: {{ dig "PrometheusOperatorReconcileErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorReconcileErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -151,12 +151,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorstatusupdateerrors summary: Errors while updating objects status. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1 - for: 10m + for: {{ dig "PrometheusOperatorStatusUpdateErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorStatusUpdateErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -179,12 +179,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornodelookuperrors summary: Errors while reconciling Prometheus. expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 - for: 10m + for: {{ dig "PrometheusOperatorNodeLookupErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorNodeLookupErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -207,12 +207,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornotready summary: Prometheus operator not ready expr: min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0) - for: 5m + for: {{ dig "PrometheusOperatorNotReady" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorNotReady" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -235,12 +235,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorrejectedresources summary: Resources rejected by Prometheus operator expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 5m + for: {{ dig "PrometheusOperatorRejectedResources" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorRejectedResources" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }}
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml
index 48cfc7a449a2..907f7b30e736 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml
@@ -42,12 +42,12 @@ spec:
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
         max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0
-      for: 10m
+      for: {{ dig "PrometheusBadConfig" "for" "10m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: critical
+        severity: {{ dig "PrometheusBadConfig" "severity" "critical" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -70,12 +70,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheussdrefreshfailure
         summary: Failed Prometheus SD refresh.
       expr: increase(prometheus_sd_refresh_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[10m]) > 0
-      for: 20m
+      for: {{ dig "PrometheusSDRefreshFailure" "for" "20m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusSDRefreshFailure" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -105,12 +105,12 @@ spec:
         >
           min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
         )
-      for: 15m
+      for: {{ dig "PrometheusNotificationQueueRunningFull" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusNotificationQueueRunningFull" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -140,12 +140,12 @@ spec:
         )
         * 100
         > 1
-      for: 15m
+      for: {{ dig "PrometheusErrorSendingAlertsToSomeAlertmanagers" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusErrorSendingAlertsToSomeAlertmanagers" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -171,12 +171,12 @@ spec:
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
         max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1
-      for: 10m
+      for: {{ dig "PrometheusNotConnectedToAlertmanagers" "for" "10m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusNotConnectedToAlertmanagers" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -199,12 +199,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbreloadsfailing
         summary: Prometheus has issues reloading blocks from disk.
       expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
-      for: 4h
+      for: {{ dig "PrometheusTSDBReloadsFailing" "for" "4h" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusTSDBReloadsFailing" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -227,12 +227,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbcompactionsfailing
         summary: Prometheus has issues compacting blocks.
       expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
-      for: 4h
+      for: {{ dig "PrometheusTSDBCompactionsFailing" "for" "4h" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusTSDBCompactionsFailing" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -256,7 +256,7 @@ spec:
         summary: Prometheus is not ingesting samples.
       expr: |-
         (
-          rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
+          sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) <= 0
         and
           (
             sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
@@ -264,12 +264,12 @@ spec:
             sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
           )
         )
-      for: 10m
+      for: {{ dig "PrometheusNotIngestingSamples" "for" "10m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusNotIngestingSamples" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -292,12 +292,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusduplicatetimestamps
         summary: Prometheus is dropping samples with duplicate timestamps.
       expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 10m
+      for: {{ dig "PrometheusDuplicateTimestamps" "for" "10m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusDuplicateTimestamps" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -320,12 +320,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusoutofordertimestamps
         summary: Prometheus drops samples with out-of-order timestamps.
       expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 10m
+      for: {{ dig "PrometheusOutOfOrderTimestamps" "for" "10m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusOutOfOrderTimestamps" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -359,12 +359,12 @@ spec:
         )
         * 100
         > 1
-      for: 15m
+      for: {{ dig "PrometheusRemoteStorageFailures" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: critical
+        severity: {{ dig "PrometheusRemoteStorageFailures" "severity" "critical" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -395,12 +395,12 @@ spec:
           max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
         )
         > 120
-      for: 15m
+      for: {{ dig "PrometheusRemoteWriteBehind" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: critical
+        severity: {{ dig "PrometheusRemoteWriteBehind" "severity" "critical" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -430,12 +430,12 @@ spec:
         >
           max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
         )
-      for: 15m
+      for: {{ dig "PrometheusRemoteWriteDesiredShards" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusRemoteWriteDesiredShards" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -458,12 +458,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusrulefailures
         summary: Prometheus is failing rule evaluations.
       expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 15m
+      for: {{ dig "PrometheusRuleFailures" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: critical
+        severity: {{ dig "PrometheusRuleFailures" "severity" "critical" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -486,12 +486,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusmissingruleevaluations
         summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
       expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 15m
+      for: {{ dig "PrometheusMissingRuleEvaluations" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusMissingRuleEvaluations" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -514,12 +514,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetlimithit
         summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
       expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 15m
+      for: {{ dig "PrometheusTargetLimitHit" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusTargetLimitHit" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -542,12 +542,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuslabellimithit
         summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
       expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 15m
+      for: {{ dig "PrometheusLabelLimitHit" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusLabelLimitHit" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -570,12 +570,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapebodysizelimithit
         summary: Prometheus has dropped some targets that exceeded body size limit.
       expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 15m
+      for: {{ dig "PrometheusScrapeBodySizeLimitHit" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusScrapeBodySizeLimitHit" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -598,12 +598,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapesamplelimithit
         summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
       expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
-      for: 15m
+      for: {{ dig "PrometheusScrapeSampleLimitHit" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusScrapeSampleLimitHit" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -626,12 +626,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetsyncfailure
         summary: Prometheus has failed to sync targets.
       expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0
-      for: 5m
+      for: {{ dig "PrometheusTargetSyncFailure" "for" "5m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: critical
+        severity: {{ dig "PrometheusTargetSyncFailure" "severity" "critical" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -654,12 +654,12 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheushighqueryload
        summary: Prometheus is reaching its maximum capacity serving concurrent requests.
       expr: avg_over_time(prometheus_engine_queries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.8
-      for: 15m
+      for: {{ dig "PrometheusHighQueryLoad" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: warning
+        severity: {{ dig "PrometheusHighQueryLoad" "severity" "warning" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
@@ -689,12 +689,12 @@ spec:
         )
         * 100
         > 3
-      for: 15m
+      for: {{ dig "PrometheusErrorSendingAlertsToAnyAlertmanager" "for" "15m" .Values.customRules }}
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
       {{- end }}
       labels:
-        severity: critical
+        severity: {{ dig "PrometheusErrorSendingAlertsToAnyAlertmanager" "severity" "critical" .Values.customRules }}
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }}
       {{- with .Values.defaultRules.additionalRuleLabels }}
         {{- toYaml . | nindent 8 }}
diff --git a/charts/kube-prometheus-stack/values.yaml b/charts/kube-prometheus-stack/values.yaml
index 529f5d048484..6a7cb06e2cdd 100644
--- a/charts/kube-prometheus-stack/values.yaml
+++ b/charts/kube-prometheus-stack/values.yaml
@@ -33,6 +33,15 @@ commonLabels: {}
 crds:
   enabled: true
 
+## custom Rules to override "for" and "severity" in defaultRules
+##
+customRules: {}
+#  AlertmanagerFailedReload:
+#    for: 3m
+#  AlertmanagerMembersInconsistent:
+#    for: 5m
+#    severity: "warning"
+
 ## Create default rules for monitoring the cluster
 ##
 defaultRules:
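A minimal usage sketch, assuming the overrides are supplied through a user values file: each templated "for:"/"severity:" above is resolved with Helm's dig function, which looks up customRules.<AlertName>.<field> and falls back to the chart default when no entry exists. The alert names and defaults below are taken from the rules in this patch; the override values themselves are illustrative only.

customRules:
  # Only list what should differ from the defaults; unlisted alerts keep theirs.
  PrometheusTargetSyncFailure:
    severity: "warning"   # default severity label is "critical"
  PrometheusHighQueryLoad:
    for: "30m"            # default pending period is 15m

With these values, {{ dig "PrometheusTargetSyncFailure" "severity" "critical" .Values.customRules }} renders as "warning", while an alert without an entry, such as PrometheusRuleFailures, still renders its default of "critical".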