From a829d933bad177512a4f5ae82a29cb1f4b60979e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Fri, 13 Oct 2023 12:36:51 +0200 Subject: [PATCH 1/2] [kube-prometheus-stack] Allow to configure keep_firing_for for all Prometheus rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan-Otto Kröpke --- charts/kube-prometheus-stack/Chart.yaml | 2 +- .../hack/sync_prometheus_rules.py | 23 +++++++ .../rules-1.14/alertmanager.rules.yaml | 24 +++++++ .../rules-1.14/config-reloaders.yaml | 3 + .../templates/prometheus/rules-1.14/etcd.yaml | 45 +++++++++++++ .../prometheus/rules-1.14/general.rules.yaml | 3 + .../rules-1.14/kube-apiserver-slos.yaml | 12 ++++ .../rules-1.14/kube-state-metrics.yaml | 12 ++++ .../rules-1.14/kubernetes-apps.yaml | 45 +++++++++++++ .../rules-1.14/kubernetes-resources.yaml | 24 +++++++ .../rules-1.14/kubernetes-storage.yaml | 15 +++++ .../kubernetes-system-apiserver.yaml | 15 +++++ .../kubernetes-system-controller-manager.yaml | 3 + .../kubernetes-system-kube-proxy.yaml | 3 + .../rules-1.14/kubernetes-system-kubelet.yaml | 27 ++++++++ .../kubernetes-system-scheduler.yaml | 3 + .../rules-1.14/kubernetes-system.yaml | 6 ++ .../prometheus/rules-1.14/node-exporter.yaml | 63 ++++++++++++++++++ .../prometheus/rules-1.14/node-network.yaml | 3 + .../rules-1.14/prometheus-operator.yaml | 21 ++++++ .../prometheus/rules-1.14/prometheus.yaml | 66 +++++++++++++++++++ charts/kube-prometheus-stack/values.yaml | 3 + 22 files changed, 420 insertions(+), 1 deletion(-) diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index eaa01743d06e..58677f758240 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -21,7 +21,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 51.6.1 +version: 51.7.0 appVersion: v0.68.0 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py index 3a1433614e2f..55a32de3fc65 100755 --- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py +++ b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py @@ -358,6 +358,28 @@ def add_custom_annotations(rules, group, indent=4): return rules + +def add_custom_keep_firing_for(rules, indent=4): + """Add if wrapper for additional rules annotations""" + indent_spaces = " " * indent + " " + keep_firing_fore = (indent_spaces + '{{- with .Values.defaultRules.keepFiringFor }}\n' + + indent_spaces + 'keep_firing_for: "{{ . 
}}"\n' + + indent_spaces + '{{- end }}') + keep_firing_fore_len = len(keep_firing_fore) + 1 + + separator = " " * indent + " for:.*" + alerts_positions = re.finditer(separator, rules) + alert = 0 + + for alert_position in alerts_positions: + # Add rule_condition after 'annotations:' statement + index = alert_position.end() + keep_firing_fore_len * alert + rules = rules[:index] + "\n" + keep_firing_fore + rules[index:] + alert += 1 + + return rules + + def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes): fix_expr(group['rules']) group_name = group['name'] @@ -374,6 +396,7 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes) # append per-alert rules rules = add_custom_labels(rules, group) rules = add_custom_annotations(rules, group) + rules = add_custom_keep_firing_for(rules) rules = add_rules_conditions_from_condition_map(rules) rules = add_rules_per_rule_conditions(rules, group) # initialize header diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml index 6b31792792c8..d2e0c8bb7e73 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml @@ -43,6 +43,9 @@ spec: # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} @@ -73,6 +76,9 @@ spec: < on (namespace,service,cluster) group_left count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} @@ -104,6 +110,9 @@ spec: ) > 0.01 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} @@ -135,6 +144,9 @@ spec: ) > 0.01 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} @@ -166,6 +178,9 @@ spec: ) > 0.01 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} @@ -195,6 +210,9 @@ spec: ) != 1 for: 20m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} @@ -230,6 +248,9 @@ spec: ) >= 0.5 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} @@ -265,6 +286,9 @@ spec: ) >= 0.5 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml index 1138c64dbc50..3c517306411c 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml @@ -40,6 +40,9 @@ spec: summary: config-reloader sidecar has not had a successful reload for 10m expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml index d5d9b73f63c9..8380d27eb58e 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml @@ -45,6 +45,9 @@ spec: ) > 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -69,6 +72,9 @@ spec: summary: etcd cluster has insufficient number of members. expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) for: 3m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -93,6 +99,9 @@ spec: summary: etcd cluster has no leader. expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -117,6 +126,9 @@ spec: summary: etcd cluster has high number of leader changes. expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -145,6 +157,9 @@ spec: sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 1 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -173,6 +188,9 @@ spec: sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -199,6 +217,9 @@ spec: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -225,6 +246,9 @@ spec: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -249,6 +273,9 @@ spec: summary: etcd cluster has high number of proposal failures. expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -275,6 +302,9 @@ spec: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -301,6 +331,9 @@ spec: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 1 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -327,6 +360,9 @@ spec: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -351,6 +387,9 @@ spec: summary: etcd cluster database is running full. expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -375,6 +414,9 @@ spec: summary: etcd cluster database growing very fast. expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"} for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} @@ -400,6 +442,9 @@ spec: summary: etcd database size in use is less than 50% of the actual allocated storage. expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml index 6b8dffd59ec8..8139fcaed93b 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml @@ -38,6 +38,9 @@ spec: summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml index 5a26e472ff2e..075bb9da1283 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml @@ -41,6 +41,9 @@ spec: and sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) for: 2m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: long: 1h severity: critical @@ -71,6 +74,9 @@ spec: and sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: long: 6h severity: critical @@ -101,6 +107,9 @@ spec: and sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: long: 1d severity: warning @@ -131,6 +140,9 @@ spec: and sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) for: 3h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: long: 3d severity: warning diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml index 49307d611e20..55ca01658aeb 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -42,6 +42,9 @@ spec: sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) > 0.01 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} @@ -71,6 +74,9 @@ spec: sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) > 0.01 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} @@ -96,6 +102,9 @@ spec: summary: kube-state-metrics sharding is misconfigured. expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} @@ -125,6 +134,9 @@ spec: sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) != 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml index b3723df8187f..513a1f309eac 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -39,6 +39,9 @@ spec: summary: Pod is crash looping. expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -71,6 +74,9 @@ spec: ) ) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -99,6 +105,9 @@ spec: != kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -133,6 +142,9 @@ spec: 0 ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -160,6 +172,9 @@ spec: kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} != 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -194,6 +209,9 @@ spec: 0 ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -222,6 +240,9 @@ spec: != kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -264,6 +285,9 @@ spec: 0 ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -312,6 +336,9 @@ spec: 0 ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -337,6 +364,9 @@ spec: summary: Pod container waiting longer than 1 hour expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0 for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -365,6 +395,9 @@ spec: - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -390,6 +423,9 @@ spec: summary: DaemonSet pods are misscheduled. expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -442,6 +478,9 @@ spec: summary: Job failed to complete. expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -480,6 +519,9 @@ spec: and changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -508,6 +550,9 @@ spec: == kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml index 262d4db8d3bd..324b6e1e4932 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -41,6 +41,9 @@ spec: and (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} @@ -69,6 +72,9 @@ spec: and (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} @@ -98,6 +104,9 @@ spec: sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) > 1.5 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} @@ -127,6 +136,9 @@ spec: sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) > 1.5 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} @@ -156,6 +168,9 @@ spec: (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 0.9 < 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: info {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} @@ -185,6 +200,9 @@ spec: (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) == 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: info {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} @@ -214,6 +232,9 @@ spec: (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} @@ -243,6 +264,9 @@ spec: sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) > ( 25 / 100 ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: info {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml index 242466b38d8e..631af0e18b79 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml @@ -50,6 +50,9 @@ spec: unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} @@ -88,6 +91,9 @@ spec: unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} @@ -124,6 +130,9 @@ spec: unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} @@ -162,6 +171,9 @@ spec: unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} @@ -187,6 +199,9 @@ spec: summary: PersistentVolume is having issues with provisioning. expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml index 80df51c4724d..bf4f27a0d200 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml @@ -38,6 +38,9 @@ spec: summary: Client certificate is about to expire. 
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -63,6 +66,9 @@ spec: summary: Client certificate is about to expire. expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -112,6 +118,9 @@ spec: summary: Kubernetes aggregated API is down. expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -138,6 +147,9 @@ spec: summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="apiserver"} == 1) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -164,6 +176,9 @@ spec: summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml index 6a5a98a631b9..8c8d94379c83 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml @@ -39,6 +39,9 @@ spec: summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-controller-manager"} == 1) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeControllerManager }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml index 51ad87c6f7c4..f52f36d4f63b 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml @@ -40,6 +40,9 @@ spec: summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-proxy"} == 1) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeProxy }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml index 1bd4a2fb3f37..643ec0f3fb5f 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml @@ -38,6 +38,9 @@ spec: summary: Node is not ready. expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -63,6 +66,9 @@ spec: summary: Node is unreachable. expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -95,6 +101,9 @@ spec: kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 ) > 0.95 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: info {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -120,6 +129,9 @@ spec: summary: Node readiness status is flapping. expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -145,6 +157,9 @@ spec: summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -170,6 +185,9 @@ spec: summary: Kubelet Pod startup latency is too high. expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -291,6 +309,9 @@ spec: summary: Kubelet has failed to renew its client certificate. expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -316,6 +337,9 @@ spec: summary: Kubelet has failed to renew its server certificate. expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -342,6 +366,9 @@ spec: summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml index 63ff60d816fe..7ca3c9b9fdb7 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml @@ -39,6 +39,9 @@ spec: summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-scheduler"} == 1) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerAlerting }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml index 99ee861922f4..f75ba36ee93a 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml @@ -38,6 +38,9 @@ spec: summary: Different semantic versions of Kubernetes components running. expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} @@ -67,6 +70,9 @@ spec: sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) > 0.01 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml index 1b02ab97a2fe..5c14e28f75bf 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml @@ -45,6 +45,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -77,6 +80,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -107,6 +113,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 30m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -137,6 +146,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 30m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -169,6 +181,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -201,6 +216,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -231,6 +249,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -261,6 +282,9 @@ spec: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -286,6 +310,9 @@ spec: summary: Network interface is reporting many receive errors. expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -311,6 +338,9 @@ spec: summary: Network interface is reporting many transmit errors. expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 for: 1h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -395,6 +425,9 @@ spec: deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -423,6 +456,9 @@ spec: and node_timex_maxerror_seconds{job="node-exporter"} >= 16 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -448,6 +484,9 @@ spec: summary: RAID Array is degraded. expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -500,6 +539,9 @@ spec: node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -528,6 +570,9 @@ spec: node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -555,6 +600,9 @@ spec: summary: High CPU usage. 
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: info {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -586,6 +634,9 @@ spec: node_load1{job="node-exporter"} / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -615,6 +666,9 @@ spec: summary: Memory major page faults are occurring at very high rate. expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -642,6 +696,9 @@ spec: summary: Host is running out of memory. expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -671,6 +728,9 @@ spec: summary: Disk IO queue is high. expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10 for: 30m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} @@ -696,6 +756,9 @@ spec: summary: Systemd service has entered failed state. expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml index 57c8b86f65b2..bc390506fb59 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml @@ -38,6 +38,9 @@ spec: summary: Network interface is often changing its status expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.network }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml index 52b5e6c50936..a5d679c0134d 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml @@ -40,6 +40,9 @@ spec: summary: Errors while performing list operations in controller. expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} @@ -65,6 +68,9 @@ spec: summary: Errors while performing watch operations in controller. expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} @@ -90,6 +96,9 @@ spec: summary: Last controller reconciliation failed expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} @@ -115,6 +124,9 @@ spec: summary: Errors while reconciling controller. expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} @@ -140,6 +152,9 @@ spec: summary: Errors while reconciling Prometheus. expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} @@ -165,6 +180,9 @@ spec: summary: Prometheus operator not ready expr: min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0) for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} @@ -190,6 +208,9 @@ spec: summary: Resources rejected by Prometheus operator expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml index 92288decab3e..48cfc7a449a2 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml @@ -43,6 +43,9 @@ spec: # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -68,6 +71,9 @@ spec: summary: Failed Prometheus SD refresh. expr: increase(prometheus_sd_refresh_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[10m]) > 0 for: 20m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -100,6 +106,9 @@ spec: min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -132,6 +141,9 @@ spec: * 100 > 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -160,6 +172,9 @@ spec: # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -185,6 +200,9 @@ spec: summary: Prometheus has issues reloading blocks from disk. expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 for: 4h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -210,6 +228,9 @@ spec: summary: Prometheus has issues compacting blocks. expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 for: 4h + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -244,6 +265,9 @@ spec: ) ) for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -269,6 +293,9 @@ spec: summary: Prometheus is dropping samples with duplicate timestamps. expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -294,6 +321,9 @@ spec: summary: Prometheus drops samples with out-of-order timestamps. expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 10m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -330,6 +360,9 @@ spec: * 100 > 1 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -363,6 +396,9 @@ spec: ) > 120 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -395,6 +431,9 @@ spec: max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) ) for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -420,6 +459,9 @@ spec: summary: Prometheus is failing rule evaluations. expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -445,6 +487,9 @@ spec: summary: Prometheus is missing rule evaluations due to slow rule group evaluation. expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -470,6 +515,9 @@ spec: summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -495,6 +543,9 @@ spec: summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -520,6 +571,9 @@ spec: summary: Prometheus has dropped some targets that exceeded body size limit. expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -545,6 +599,9 @@ spec: summary: Prometheus has failed scrapes that have exceeded the configured sample limit. expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -570,6 +627,9 @@ spec: summary: Prometheus has failed to sync targets. expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0 for: 5m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -595,6 +655,9 @@ spec: summary: Prometheus is reaching its maximum capacity serving concurrent requests. expr: avg_over_time(prometheus_engine_queries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.8 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . 
}}" + {{- end }} labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} @@ -627,6 +690,9 @@ spec: * 100 > 3 for: 15m + {{- with .Values.defaultRules.keepFiringFor }} + keep_firing_for: "{{ . }}" + {{- end }} labels: severity: critical {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} diff --git a/charts/kube-prometheus-stack/values.yaml b/charts/kube-prometheus-stack/values.yaml index c6592604ad0e..80c08935f180 100644 --- a/charts/kube-prometheus-stack/values.yaml +++ b/charts/kube-prometheus-stack/values.yaml @@ -70,6 +70,9 @@ defaultRules: ## Reduce app namespace alert scope appNamespacesTarget: ".*" + ## Set keep_firing_for for all alerts + keepFiringFor: "" + ## Labels for default rules labels: {} ## Annotations for default rules From 841bbdd18b4a5e2ac6dbd8e8072736ddb070caf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Fri, 13 Oct 2023 15:51:51 +0200 Subject: [PATCH 2/2] s/keep_firing_fore/keep_firing_for MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan-Otto Kröpke --- .../kube-prometheus-stack/hack/sync_prometheus_rules.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py index 55a32de3fc65..8a6b3edb68bf 100755 --- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py +++ b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py @@ -362,10 +362,10 @@ def add_custom_annotations(rules, group, indent=4): def add_custom_keep_firing_for(rules, indent=4): """Add if wrapper for additional rules annotations""" indent_spaces = " " * indent + " " - keep_firing_fore = (indent_spaces + '{{- with .Values.defaultRules.keepFiringFor }}\n' + + keep_firing_for = (indent_spaces + '{{- with .Values.defaultRules.keepFiringFor }}\n' + indent_spaces + 'keep_firing_for: "{{ . }}"\n' + indent_spaces + '{{- end }}') - keep_firing_fore_len = len(keep_firing_fore) + 1 + keep_firing_for_len = len(keep_firing_for) + 1 separator = " " * indent + " for:.*" alerts_positions = re.finditer(separator, rules) @@ -373,8 +373,8 @@ def add_custom_keep_firing_for(rules, indent=4): for alert_position in alerts_positions: # Add rule_condition after 'annotations:' statement - index = alert_position.end() + keep_firing_fore_len * alert - rules = rules[:index] + "\n" + keep_firing_fore + rules[index:] + index = alert_position.end() + keep_firing_for_len * alert + rules = rules[:index] + "\n" + keep_firing_for + rules[index:] alert += 1 return rules