From aeada2226cda35cdfc4eb4c9eae8156c85eb3938 Mon Sep 17 00:00:00 2001 From: Ilia Lazebnik Date: Sun, 18 Feb 2024 03:13:19 +0200 Subject: [PATCH 01/16] bump pingmesh to 1.2.1 (#4260) Signed-off-by: drfaust92 --- charts/prometheus-pingmesh-exporter/Chart.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/charts/prometheus-pingmesh-exporter/Chart.yaml b/charts/prometheus-pingmesh-exporter/Chart.yaml index ff33cb2db864..1b1483c5e1e0 100644 --- a/charts/prometheus-pingmesh-exporter/Chart.yaml +++ b/charts/prometheus-pingmesh-exporter/Chart.yaml @@ -1,17 +1,17 @@ apiVersion: v2 -appVersion: "v1.1.0" +appVersion: "v1.2.1" description: Prometheus Pingmesh Exporter home: https://stack.kubeservice.cn/ icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png keywords: -- prometheus -- pingmesh -- monitoring + - prometheus + - pingmesh + - monitoring maintainers: -- email: dongjiang1989@126.com - name: dongjiang1989 + - email: dongjiang1989@126.com + name: dongjiang1989 name: prometheus-pingmesh-exporter sources: -- https://github.com/kubeservice-stack/pingmesh-agent + - https://github.com/kubeservice-stack/pingmesh-agent type: application -version: 0.3.0 +version: 0.4.0 From 4eb488fe3e3e3248e4b55ba7ca0ac5f8bbd00df1 Mon Sep 17 00:00:00 2001 From: Ilia Lazebnik Date: Sun, 18 Feb 2024 21:42:42 +0200 Subject: [PATCH 02/16] [prometheus] bump prom deps (#4262) Signed-off-by: drfaust92 --- charts/prometheus/Chart.lock | 10 +++++----- charts/prometheus/Chart.yaml | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/charts/prometheus/Chart.lock b/charts/prometheus/Chart.lock index 32354125283c..e5b0e0faf93d 100644 --- a/charts/prometheus/Chart.lock +++ b/charts/prometheus/Chart.lock @@ -1,15 +1,15 @@ dependencies: - name: alertmanager repository: https://prometheus-community.github.io/helm-charts - version: 1.7.0 + version: 1.8.0 - name: kube-state-metrics repository: https://prometheus-community.github.io/helm-charts version: 5.16.0 - name: prometheus-node-exporter repository: https://prometheus-community.github.io/helm-charts - version: 4.26.0 + version: 4.30.2 - name: prometheus-pushgateway repository: https://prometheus-community.github.io/helm-charts - version: 2.6.0 -digest: sha256:c04ed5d0a9f4673cdf8cf43d8c09990170bd3a0c30f4529e6a06f00ea6ff2d63 -generated: "2024-01-25T19:22:54.227295+02:00" + version: 2.7.0 +digest: sha256:8f7e37ff7fde55738e5b0016425588414206eb8aea2cc860cb1d1c5fa8dabbd0 +generated: "2024-02-17T20:04:35.847534+02:00" diff --git a/charts/prometheus/Chart.yaml b/charts/prometheus/Chart.yaml index 6277f13f5e9c..90dab35d18f8 100644 --- a/charts/prometheus/Chart.yaml +++ b/charts/prometheus/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: prometheus appVersion: v2.49.1 -version: 25.12.0 +version: 25.13.0 kubeVersion: ">=1.19.0-0" description: Prometheus is a monitoring system and time series database. 
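# A minimal sketch, not part of the upstream diff, of how a dependency bump
# like PATCH 02 is typically produced, assuming a local checkout of this
# repository with the prometheus-community repo already configured:
#
#   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
#   helm repo update
#   helm dependency update charts/prometheus
#
# helm dependency update resolves the wildcard ranges declared in Chart.yaml
# (for example "1.8.*") to the exact versions, digest, and "generated"
# timestamp recorded in the Chart.lock hunk above.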
home: https://prometheus.io/ @@ -26,7 +26,7 @@ maintainers: type: application dependencies: - name: alertmanager - version: "1.7.*" + version: "1.8.*" repository: https://prometheus-community.github.io/helm-charts condition: alertmanager.enabled - name: kube-state-metrics @@ -34,11 +34,11 @@ dependencies: repository: https://prometheus-community.github.io/helm-charts condition: kube-state-metrics.enabled - name: prometheus-node-exporter - version: "4.26.*" + version: "4.30.*" repository: https://prometheus-community.github.io/helm-charts condition: prometheus-node-exporter.enabled - name: prometheus-pushgateway - version: "2.6.*" + version: "2.7.*" repository: https://prometheus-community.github.io/helm-charts condition: prometheus-pushgateway.enabled keywords: From bbeb7d4a61b8100f5b2c0e9bc83b7241012a95b2 Mon Sep 17 00:00:00 2001 From: Richard Tief <56597015+richardtief@users.noreply.github.com> Date: Mon, 19 Feb 2024 09:55:30 +0100 Subject: [PATCH 03/16] [kube-prometheus-stack] use of helm tpl for ingress annotations (#4236) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jan-Otto Kröpke --- charts/kube-prometheus-stack/Chart.yaml | 2 +- .../kube-prometheus-stack/templates/alertmanager/ingress.yaml | 2 +- .../templates/alertmanager/ingressperreplica.yaml | 2 +- charts/kube-prometheus-stack/templates/prometheus/ingress.yaml | 2 +- .../templates/prometheus/ingressThanosSidecar.yaml | 2 +- .../templates/prometheus/ingressperreplica.yaml | 2 +- .../kube-prometheus-stack/templates/thanos-ruler/ingress.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index 94e407114b03..6131ba5ac953 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -23,7 +23,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 56.7.0 +version: 56.8.0 appVersion: v0.71.2 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml b/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml index 201a69ec6fc8..be9f5aa279c4 100644 --- a/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml +++ b/charts/kube-prometheus-stack/templates/alertmanager/ingress.yaml @@ -14,7 +14,7 @@ metadata: namespace: {{ template "kube-prometheus-stack.namespace" . }} {{- if .Values.alertmanager.ingress.annotations }} annotations: -{{ toYaml .Values.alertmanager.ingress.annotations | indent 4 }} + {{- tpl (toYaml .Values.alertmanager.ingress.annotations) . | nindent 4 }} {{- end }} labels: app: {{ template "kube-prometheus-stack.name" . }}-alertmanager diff --git a/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml b/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml index f21bf961697f..8fb5d13346cb 100644 --- a/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml +++ b/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml @@ -25,7 +25,7 @@ items: {{- end }} {{- if $ingressValues.annotations }} annotations: -{{ toYaml $ingressValues.annotations | indent 8 }} + {{- tpl (toYaml $ingressValues.annotations) . 
| nindent 8 }} {{- end }} spec: {{- if $apiIsStable }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml b/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml index 91fadf905fdb..d2f6af5dd147 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/ingress.yaml @@ -11,7 +11,7 @@ kind: Ingress metadata: {{- if .Values.prometheus.ingress.annotations }} annotations: -{{ toYaml .Values.prometheus.ingress.annotations | indent 4 }} + {{- tpl (toYaml .Values.prometheus.ingress.annotations) . | nindent 4 }} {{- end }} name: {{ $serviceName }} namespace: {{ template "kube-prometheus-stack.namespace" . }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml b/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml index a3da99faa884..3f507cfa9f6a 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/ingressThanosSidecar.yaml @@ -11,7 +11,7 @@ kind: Ingress metadata: {{- if .Values.prometheus.thanosIngress.annotations }} annotations: -{{ toYaml .Values.prometheus.thanosIngress.annotations | indent 4 }} + {{- tpl (toYaml .Values.prometheus.thanosIngress.annotations) . | nindent 4 }} {{- end }} name: {{ template "kube-prometheus-stack.fullname" . }}-thanos-gateway namespace: {{ template "kube-prometheus-stack.namespace" . }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml b/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml index df631993baef..a3685275d734 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml @@ -25,7 +25,7 @@ items: {{- end }} {{- if $ingressValues.annotations }} annotations: -{{ toYaml $ingressValues.annotations | indent 8 }} + {{- tpl (toYaml $ingressValues.annotations) . | nindent 8 }} {{- end }} spec: {{- if $apiIsStable }} diff --git a/charts/kube-prometheus-stack/templates/thanos-ruler/ingress.yaml b/charts/kube-prometheus-stack/templates/thanos-ruler/ingress.yaml index aed9db280f53..e245ad448e82 100644 --- a/charts/kube-prometheus-stack/templates/thanos-ruler/ingress.yaml +++ b/charts/kube-prometheus-stack/templates/thanos-ruler/ingress.yaml @@ -13,7 +13,7 @@ metadata: namespace: {{ template "kube-prometheus-stack.namespace" . }} {{- if .Values.thanosRuler.ingress.annotations }} annotations: -{{ toYaml .Values.thanosRuler.ingress.annotations | indent 4 }} + {{- tpl (toYaml .Values.thanosRuler.ingress.annotations) . | nindent 4 }} {{- end }} labels: app: {{ template "kube-prometheus-stack.thanosRuler.name" . 
}} From 412f4d3b6b145f2d5f6a3cf408c34141058f35e6 Mon Sep 17 00:00:00 2001 From: Ilia Lazebnik Date: Mon, 19 Feb 2024 17:32:35 +0200 Subject: [PATCH 04/16] bump conntrack to 0.4.18 (#4259) Signed-off-by: drfaust92 --- charts/prometheus-conntrack-stats-exporter/Chart.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/prometheus-conntrack-stats-exporter/Chart.yaml b/charts/prometheus-conntrack-stats-exporter/Chart.yaml index 4d20a94a1f0f..843da25ca06d 100644 --- a/charts/prometheus-conntrack-stats-exporter/Chart.yaml +++ b/charts/prometheus-conntrack-stats-exporter/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: prometheus-conntrack-stats-exporter description: A Helm chart for conntrack-stats-exporter type: application -version: 0.5.9 -appVersion: v0.4.17 +version: 0.5.10 +appVersion: v0.4.18 home: https://github.com/jwkohnen/conntrack-stats-exporter sources: - https://github.com/jwkohnen/conntrack-stats-exporter From f71c7dab23d83e8646203809a6999fe7b95f2f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Tue, 20 Feb 2024 21:52:48 +0100 Subject: [PATCH 05/16] [kube-prometheus-stack] fix helm error in ingressperreplica.yaml (#4270) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan-Otto Kröpke --- charts/kube-prometheus-stack/Chart.yaml | 2 +- .../templates/prometheus/ingressperreplica.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index 6131ba5ac953..98b141bcbd17 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -23,7 +23,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 56.8.0 +version: 56.8.1 appVersion: v0.71.2 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml b/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml index a3685275d734..1d76d135c8dc 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/ingressperreplica.yaml @@ -25,7 +25,7 @@ items: {{- end }} {{- if $ingressValues.annotations }} annotations: - {{- tpl (toYaml $ingressValues.annotations) . 
| nindent 8 }} + {{- tpl (toYaml $ingressValues.annotations) $ | nindent 8 }} {{- end }} spec: {{- if $apiIsStable }} From 030b87af489e3718310dd1ea73cc7499e1068d7f Mon Sep 17 00:00:00 2001 From: Ilia Lazebnik Date: Tue, 20 Feb 2024 23:14:26 +0200 Subject: [PATCH 06/16] [prometheus-elasticsearch-exporter] add support for podmonitor (#4264) * [prometheus-elasticsearch-exporter] Add PodMonitor support Signed-off-by: ShlomiTubul * [prometheus-elasticsearch-exporter] revert default values, bump version Signed-off-by: ShlomiTubul * add support for podmonitor Signed-off-by: drfaust92 * add support for podmonitor Signed-off-by: drfaust92 * add support for podmonitor Signed-off-by: drfaust92 * Update charts/prometheus-elasticsearch-exporter/templates/podmonitor.yaml Co-authored-by: zeritti <47476160+zeritti@users.noreply.github.com> Signed-off-by: Ilia Lazebnik * Update charts/prometheus-elasticsearch-exporter/values.yaml Co-authored-by: zeritti <47476160+zeritti@users.noreply.github.com> Signed-off-by: Ilia Lazebnik * Update charts/prometheus-elasticsearch-exporter/templates/podmonitor.yaml Co-authored-by: zeritti <47476160+zeritti@users.noreply.github.com> Signed-off-by: Ilia Lazebnik * Update charts/prometheus-elasticsearch-exporter/values.yaml Co-authored-by: zeritti <47476160+zeritti@users.noreply.github.com> Signed-off-by: Ilia Lazebnik * Update charts/prometheus-elasticsearch-exporter/values.yaml Co-authored-by: zeritti <47476160+zeritti@users.noreply.github.com> Signed-off-by: Ilia Lazebnik * CR comments Signed-off-by: drfaust92 * remove job label Signed-off-by: drfaust92 * Update charts/prometheus-elasticsearch-exporter/values.yaml Co-authored-by: zeritti <47476160+zeritti@users.noreply.github.com> Signed-off-by: Ilia Lazebnik --------- Signed-off-by: ShlomiTubul Signed-off-by: drfaust92 Signed-off-by: Ilia Lazebnik Co-authored-by: ShlomiTubul Co-authored-by: zeritti <47476160+zeritti@users.noreply.github.com> --- .../Chart.yaml | 2 +- .../templates/podmonitor.yaml | 50 +++++++++++++++++++ .../templates/service.yaml | 2 + .../values.yaml | 17 +++++++ 4 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 charts/prometheus-elasticsearch-exporter/templates/podmonitor.yaml diff --git a/charts/prometheus-elasticsearch-exporter/Chart.yaml b/charts/prometheus-elasticsearch-exporter/Chart.yaml index ab28150b3b9a..daf6a25fb784 100644 --- a/charts/prometheus-elasticsearch-exporter/Chart.yaml +++ b/charts/prometheus-elasticsearch-exporter/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 description: Elasticsearch stats exporter for Prometheus name: prometheus-elasticsearch-exporter -version: 5.5.0 +version: 5.6.0 kubeVersion: ">=1.10.0-0" appVersion: "v1.7.0" home: https://github.com/prometheus-community/elasticsearch_exporter diff --git a/charts/prometheus-elasticsearch-exporter/templates/podmonitor.yaml b/charts/prometheus-elasticsearch-exporter/templates/podmonitor.yaml new file mode 100644 index 000000000000..fe4fb30cc615 --- /dev/null +++ b/charts/prometheus-elasticsearch-exporter/templates/podmonitor.yaml @@ -0,0 +1,50 @@ +{{- if and .Values.podMonitor.enabled .Values.serviceMonitor.enabled }} +{{- fail "Either .Values.podMonitor.enabled or .Values.serviceMonitor.enabled can be enabled at a time, but not both." }} +{{- else if .Values.podMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ template "elasticsearch-exporter.fullname" . 
}} + {{- if .Values.podMonitor.namespace }} + namespace: {{ .Values.podMonitor.namespace }} + {{- end }} + labels: + chart: {{ template "elasticsearch-exporter.chart" . }} + app: {{ template "elasticsearch-exporter.name" . }} + release: "{{ .Release.Name }}" + heritage: "{{ .Release.Service }}" + {{- if .Values.podMonitor.labels }} + {{- toYaml .Values.podMonitor.labels | nindent 4 }} + {{- end }} +spec: + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + podMetricsEndpoints: + - path: {{ .Values.web.path }} + port: {{ .Values.deployment.metricsPort.name }} + {{- if .Values.podMonitor.scheme }} + scheme: {{ .Values.podMonitor.scheme }} + {{- end }} + {{- if .Values.podMonitor.interval }} + interval: {{ .Values.podMonitor.interval }} + {{- end }} + {{- if .Values.podMonitor.scrapeTimeout }} + scrapeTimeout: {{ .Values.podMonitor.scrapeTimeout }} + {{- end }} + {{- if .Values.podMonitor.honorLabels }} + honorLabels: true + {{- end }} + {{- if .Values.podMonitor.metricRelabelings }} + metricRelabelings: + {{- toYaml .Values.podMonitor.metricRelabelings | nindent 6 }} + {{- end }} + {{- if .Values.podMonitor.relabelings }} + relabelings: + {{- toYaml .Values.podMonitor.relabelings | nindent 6 }} + {{- end }} + selector: + matchLabels: + app: {{ template "elasticsearch-exporter.name" . }} + release: "{{ .Release.Name }}" +{{- end }} diff --git a/charts/prometheus-elasticsearch-exporter/templates/service.yaml b/charts/prometheus-elasticsearch-exporter/templates/service.yaml index d268ca5748d9..a3420a999392 100644 --- a/charts/prometheus-elasticsearch-exporter/templates/service.yaml +++ b/charts/prometheus-elasticsearch-exporter/templates/service.yaml @@ -1,3 +1,4 @@ +{{- if .Values.service.enabled }} kind: Service apiVersion: v1 metadata: @@ -23,3 +24,4 @@ spec: selector: app: {{ template "elasticsearch-exporter.name" . 
}} release: "{{ .Release.Name }}" +{{- end }} diff --git a/charts/prometheus-elasticsearch-exporter/values.yaml b/charts/prometheus-elasticsearch-exporter/values.yaml index f70dd9079180..456731379f4e 100644 --- a/charts/prometheus-elasticsearch-exporter/values.yaml +++ b/charts/prometheus-elasticsearch-exporter/values.yaml @@ -71,6 +71,7 @@ affinity: {} initContainers: [] service: + enabled: true type: ClusterIP httpPort: 9108 metricsPort: @@ -81,6 +82,8 @@ service: deployment: annotations: {} labels: {} + metricsPort: + name: http ## Extra environment variables that will be passed into the exporter pod ## example: @@ -250,6 +253,20 @@ serviceMonitor: metricRelabelings: [] sampleLimit: 0 +podMonitor: + ## If true, a PodMonitor CRD is created for a Prometheus Operator + ## https://prometheus-operator.dev/docs/operator/api/#monitoring.coreos.com/v1.PodMonitor + ## + enabled: false + namespace: "" + labels: {} + interval: 60s + scrapeTimeout: 10s + honorLabels: true + scheme: http + relabelings: [] + metricRelabelings: [] + prometheusRule: ## If true, a PrometheusRule CRD is created for a prometheus operator ## https://github.com/coreos/prometheus-operator From d97743ef1d4761fefc4ad503587f5168bc5d927e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Tue, 20 Feb 2024 23:00:10 +0100 Subject: [PATCH 07/16] [kube-prometheus-stack] fix helm error in am/ingressperreplica.yaml (#4271) --- charts/kube-prometheus-stack/Chart.yaml | 2 +- .../templates/alertmanager/ingressperreplica.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index 98b141bcbd17..ae8dbb37ca6f 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -23,7 +23,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 56.8.1 +version: 56.8.2 appVersion: v0.71.2 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml b/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml index 8fb5d13346cb..b2e00a416290 100644 --- a/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml +++ b/charts/kube-prometheus-stack/templates/alertmanager/ingressperreplica.yaml @@ -25,7 +25,7 @@ items: {{- end }} {{- if $ingressValues.annotations }} annotations: - {{- tpl (toYaml $ingressValues.annotations) . 
| nindent 8 }} + {{- tpl (toYaml $ingressValues.annotations) $ | nindent 8 }} {{- end }} spec: {{- if $apiIsStable }} From 71bba71b24dce2edcde643ef6bc1ecac9b70632a Mon Sep 17 00:00:00 2001 From: Sheikh-Abubaker Date: Wed, 21 Feb 2024 20:26:32 +0530 Subject: [PATCH 08/16] [prometheus-blackbox-exporter] define port field in selfservicemonitor (#4258) * define port field in selfservicemonitor Signed-off-by: Sheikh-Abubaker * Update values.yaml Signed-off-by: Sheikh-Abubaker * Update values.yaml Signed-off-by: Sheikh-Abubaker --------- Signed-off-by: Sheikh-Abubaker --- charts/prometheus-blackbox-exporter/Chart.yaml | 2 +- .../templates/selfservicemonitor.yaml | 3 +++ charts/prometheus-blackbox-exporter/values.yaml | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/charts/prometheus-blackbox-exporter/Chart.yaml b/charts/prometheus-blackbox-exporter/Chart.yaml index 422aa8c92ff6..62b9e003ea63 100644 --- a/charts/prometheus-blackbox-exporter/Chart.yaml +++ b/charts/prometheus-blackbox-exporter/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: Prometheus Blackbox Exporter name: prometheus-blackbox-exporter -version: 8.10.1 +version: 8.11.0 appVersion: v0.24.0 home: https://github.com/prometheus/blackbox_exporter sources: diff --git a/charts/prometheus-blackbox-exporter/templates/selfservicemonitor.yaml b/charts/prometheus-blackbox-exporter/templates/selfservicemonitor.yaml index 56ec3030a259..9063688ebeba 100644 --- a/charts/prometheus-blackbox-exporter/templates/selfservicemonitor.yaml +++ b/charts/prometheus-blackbox-exporter/templates/selfservicemonitor.yaml @@ -16,6 +16,9 @@ spec: interval: {{ .Values.serviceMonitor.selfMonitor.interval }} scrapeTimeout: {{ .Values.serviceMonitor.selfMonitor.scrapeTimeout }} scheme: http + {{- with .Values.serviceMonitor.selfMonitor.port }} + port: {{ . 
}} + {{- end }} {{- if .Values.serviceMonitor.selfMonitor.additionalRelabeling }} relabelings: {{- toYaml .Values.serviceMonitor.selfMonitor.additionalRelabeling | nindent 6 }} diff --git a/charts/prometheus-blackbox-exporter/values.yaml b/charts/prometheus-blackbox-exporter/values.yaml index 98acfb0cd327..4c626bca8d17 100644 --- a/charts/prometheus-blackbox-exporter/values.yaml +++ b/charts/prometheus-blackbox-exporter/values.yaml @@ -274,6 +274,8 @@ serviceMonitor: path: /metrics interval: 30s scrapeTimeout: 30s + ## Port can be defined by assigning a value for the port key below + ## port: ## If true, a ServiceMonitor CRD is created for a prometheus operator ## https://github.com/coreos/prometheus-operator for each target From 33697bec952f9325f917c3cfec01cd2e9d2f3d3a Mon Sep 17 00:00:00 2001 From: ps-xaf Date: Thu, 22 Feb 2024 13:15:53 +0100 Subject: [PATCH 09/16] [kube-prometheus-stack] allow override of for and severity rules (#4225) --- charts/kube-prometheus-stack/Chart.yaml | 2 +- .../ci/03-non-defaults-values.yaml | 7 ++ .../hack/sync_prometheus_rules.py | 59 ++++++++++++ .../rules-1.14/alertmanager.rules.yaml | 32 +++---- .../rules-1.14/config-reloaders.yaml | 4 +- .../templates/prometheus/rules-1.14/etcd.yaml | 60 ++++++------ .../prometheus/rules-1.14/general.rules.yaml | 8 +- .../rules-1.14/kube-apiserver-slos.yaml | 16 ++-- .../rules-1.14/kube-state-metrics.yaml | 16 ++-- .../rules-1.14/kubernetes-apps.yaml | 62 ++++++------ .../rules-1.14/kubernetes-resources.yaml | 32 +++---- .../rules-1.14/kubernetes-storage.yaml | 30 +++--- .../kubernetes-system-apiserver.yaml | 22 ++--- .../kubernetes-system-controller-manager.yaml | 4 +- .../kubernetes-system-kube-proxy.yaml | 6 +- .../rules-1.14/kubernetes-system-kubelet.yaml | 44 ++++----- .../kubernetes-system-scheduler.yaml | 4 +- .../rules-1.14/kubernetes-system.yaml | 8 +- .../prometheus/rules-1.14/node-exporter.yaml | 96 +++++++++---------- .../prometheus/rules-1.14/node-network.yaml | 4 +- .../rules-1.14/prometheus-operator.yaml | 32 +++---- .../prometheus/rules-1.14/prometheus.yaml | 90 ++++++++--------- charts/kube-prometheus-stack/values.yaml | 9 ++ 23 files changed, 361 insertions(+), 286 deletions(-) diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index ae8dbb37ca6f..49b45f92d057 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -23,7 +23,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 56.8.2 +version: 56.9.0 appVersion: v0.71.2 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml b/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml index c50ff240652c..0838274de5fa 100644 --- a/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml +++ b/charts/kube-prometheus-stack/ci/03-non-defaults-values.yaml @@ -33,3 +33,10 @@ prometheus: logFormat: json additionalConfigString: |- logLevel: {{ print "debug" | quote }} + +customRules: + AlertmanagerFailedReload: + for: 3m + AlertmanagerMembersInconsistent: + for: 5m + severity: "warning" diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py index a621b3d6c1ce..41128bf37543 100755 --- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py +++ 
b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py @@ -406,6 +406,63 @@ def add_custom_keep_firing_for(rules, indent=4): return rules +def add_custom_for(rules, indent=4): + """Add custom 'for:' condition in rules""" + replace_field = "for:" + rules = add_custom_alert_rules(rules, replace_field, indent) + + return rules + + +def add_custom_severity(rules, indent=4): + """Add custom 'severity:' condition in rules""" + replace_field = "severity:" + rules = add_custom_alert_rules(rules, replace_field, indent) + + return rules + + +def add_custom_alert_rules(rules, key_to_replace, indent): + """Extend alert field to allow custom values""" + key_to_replace_indented = ' ' * indent + key_to_replace + alertkey_field = '- alert:' + found_alert_key = False + alertname = None + updated_rules = '' + + # pylint: disable=C0200 + i = 0 + while i < len(rules): + if rules[i:i + len(alertkey_field)] == alertkey_field: + found_alert_key = True + start_index_word_after = i + len(alertkey_field) + 1 + end_index_alertkey_field = start_index_word_after + while end_index_alertkey_field < len(rules) and rules[end_index_alertkey_field].isalnum(): + end_index_alertkey_field += 1 + + alertname = rules[start_index_word_after:end_index_alertkey_field] + + if found_alert_key: + if rules[i:i + len(key_to_replace_indented)] == key_to_replace_indented: + found_alert_key = False + start_index_key_value = i + len(key_to_replace_indented) + 1 + end_index_key_to_replace = start_index_key_value + while end_index_key_to_replace < len(rules) and rules[end_index_key_to_replace].isalnum(): + end_index_key_to_replace += 1 + + word_after_key_to_replace = rules[start_index_key_value:end_index_key_to_replace] + new_key = key_to_replace_indented + ' {{ dig "' + alertname + \ + '" "' + key_to_replace[:-1] + '" "' + \ + word_after_key_to_replace + '" .Values.customRules }}' + updated_rules += new_key + i = end_index_key_to_replace + + updated_rules += rules[i] + i += 1 + + return updated_rules + + def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes): fix_expr(group['rules']) group_name = group['name'] @@ -423,6 +480,8 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes) rules = add_custom_labels(rules, group) rules = add_custom_annotations(rules, group) rules = add_custom_keep_firing_for(rules) + rules = add_custom_for(rules) + rules = add_custom_severity(rules) rules = add_rules_conditions_from_condition_map(rules) rules = add_rules_per_rule_conditions(rules, group) # initialize header diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml index c306dc6c18ad..b262424d4aa4 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml @@ -42,12 +42,12 @@ spec: # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0 - for: 10m + for: {{ dig "AlertmanagerFailedReload" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerFailedReload" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -75,12 +75,12 @@ spec: max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) group_left count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) - for: 15m + for: {{ dig "AlertmanagerMembersInconsistent" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerMembersInconsistent" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -109,12 +109,12 @@ spec: ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) ) > 0.01 - for: 5m + for: {{ dig "AlertmanagerFailedToSendAlerts" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "AlertmanagerFailedToSendAlerts" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -143,12 +143,12 @@ spec: ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) ) > 0.01 - for: 5m + for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -177,12 +177,12 @@ spec: ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) ) > 0.01 - for: 5m + for: {{ dig "AlertmanagerClusterFailedToSendAlerts" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "AlertmanagerClusterFailedToSendAlerts" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -209,12 +209,12 @@ spec: count_values by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) ) != 1 - for: 20m + for: {{ dig "AlertmanagerConfigInconsistent" "for" "20m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerConfigInconsistent" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -247,12 +247,12 @@ spec: ) ) >= 0.5 - for: 5m + for: {{ dig "AlertmanagerClusterDown" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerClusterDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -285,12 +285,12 @@ spec: ) ) >= 0.5 - for: 5m + for: {{ dig "AlertmanagerClusterCrashlooping" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "AlertmanagerClusterCrashlooping" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml index 3c517306411c..72ebc4cc6dee 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/configreloadersidecarerrors summary: config-reloader sidecar has not had a successful reload for 10m expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0 - for: 10m + for: {{ dig "ConfigReloaderSidecarErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "ConfigReloaderSidecarErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml index a46495f71cbc..b7529604b14b 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml @@ -44,12 +44,12 @@ spec: ) ) > 0 - for: 10m + for: {{ dig "etcdMembersDown" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdMembersDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -71,12 +71,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' summary: etcd cluster has insufficient number of members. expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) - for: 3m + for: {{ dig "etcdInsufficientMembers" "for" "3m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdInsufficientMembers" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -98,12 +98,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' summary: etcd cluster has no leader. expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - for: 1m + for: {{ dig "etcdNoLeader" "for" "1m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdNoLeader" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -125,12 +125,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' summary: etcd cluster has high number of leader changes. expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 - for: 5m + for: {{ dig "etcdHighNumberOfLeaderChanges" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighNumberOfLeaderChanges" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -156,12 +156,12 @@ spec: / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 1 - for: 10m + for: {{ dig "etcdHighNumberOfFailedGRPCRequests" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighNumberOfFailedGRPCRequests" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -187,12 +187,12 @@ spec: / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 - for: 5m + for: {{ dig "etcdHighNumberOfFailedGRPCRequests" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdHighNumberOfFailedGRPCRequests" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -216,12 +216,12 @@ spec: expr: |- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 - for: 10m + for: {{ dig "etcdGRPCRequestsSlow" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdGRPCRequestsSlow" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -245,12 +245,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 - for: 10m + for: {{ dig "etcdMemberCommunicationSlow" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdMemberCommunicationSlow" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -272,12 +272,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' summary: etcd cluster has high number of proposal failures. expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 - for: 15m + for: {{ dig "etcdHighNumberOfFailedProposals" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighNumberOfFailedProposals" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -301,12 +301,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 - for: 10m + for: {{ dig "etcdHighFsyncDurations" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighFsyncDurations" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -330,12 +330,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 1 - for: 10m + for: {{ dig "etcdHighFsyncDurations" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdHighFsyncDurations" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -359,12 +359,12 @@ spec: expr: |- histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 - for: 10m + for: {{ dig "etcdHighCommitDurations" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdHighCommitDurations" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -386,12 +386,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' summary: etcd cluster database is running full. expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 - for: 10m + for: {{ dig "etcdDatabaseQuotaLowSpace" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "etcdDatabaseQuotaLowSpace" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -413,12 +413,12 @@ spec: description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.' summary: etcd cluster database growing very fast. expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"} - for: 10m + for: {{ dig "etcdExcessiveDatabaseGrowth" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdExcessiveDatabaseGrowth" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -441,12 +441,12 @@ spec: runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation summary: etcd database size in use is less than 50% of the actual allocated storage. 
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 - for: 10m + for: {{ dig "etcdDatabaseHighFragmentationRatio" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "etcdDatabaseHighFragmentationRatio" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml index a62db1f06bc5..afdb1288dd13 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10 - for: 10m + for: {{ dig "TargetDown" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "TargetDown" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -76,7 +76,7 @@ spec: summary: An alert that should always be firing to certify that Alertmanager is working properly. expr: vector(1) labels: - severity: none + severity: {{ dig "Watchdog" "severity" "none" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -112,7 +112,7 @@ spec: summary: Info-level alert inhibition. expr: ALERTS{severity = "info"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 labels: - severity: none + severity: {{ dig "InfoInhibitor" "severity" "none" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml index 075bb9da1283..3f6a6a2426cc 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-slos.yaml @@ -40,13 +40,13 @@ spec: sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) and sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "2m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: long: 1h - severity: critical + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "critical" .Values.customRules }} short: 5m {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} @@ -73,13 +73,13 @@ spec: sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) and sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: long: 6h - severity: critical + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "critical" .Values.customRules }} short: 30m {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} @@ -106,13 +106,13 @@ spec: sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) and sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: long: 1d - severity: warning + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "warning" .Values.customRules }} short: 2h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} @@ -139,13 +139,13 @@ spec: sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) and sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h + for: {{ dig "KubeAPIErrorBudgetBurn" "for" "3h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: long: 3d - severity: warning + severity: {{ dig "KubeAPIErrorBudgetBurn" "severity" "warning" .Values.customRules }} short: 6h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverSlos }} {{- with .Values.defaultRules.additionalRuleLabels }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml index 7471bd998343..93c6fe9331e0 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -42,12 +42,12 @@ spec: / sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster)) > 0.01 - for: 15m + for: {{ dig "KubeStateMetricsListErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsListErrors" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -74,12 +74,12 @@ spec: / sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0.01 - for: 15m + for: {{ dig "KubeStateMetricsWatchErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsWatchErrors" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -102,12 +102,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch summary: kube-state-metrics sharding is misconfigured. expr: stdvar (kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 - for: 15m + for: {{ dig "KubeStateMetricsShardingMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsShardingMismatch" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -134,12 +134,12 @@ spec: - sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="{{ $kubeStateMetricsJob }}"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 - for: 15m + for: {{ dig "KubeStateMetricsShardsMissing" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeStateMetricsShardsMissing" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml index 48845fe7bc35..8582292a0012 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping summary: Pod is crash looping. 
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1 - for: 15m + for: {{ dig "KubePodCrashLooping" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePodCrashLooping" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -74,12 +74,12 @@ spec: 1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) ) ) > 0 - for: 15m + for: {{ dig "KubePodNotReady" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePodNotReady" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -105,12 +105,12 @@ spec: kube_deployment_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != kube_deployment_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - for: 15m + for: {{ dig "KubeDeploymentGenerationMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDeploymentGenerationMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -142,12 +142,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeDeploymentReplicasMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDeploymentReplicasMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -172,12 +172,12 @@ spec: expr: |- kube_deployment_status_condition{condition="Progressing", status="false",job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != 0 - for: 15m + for: {{ dig "KubeDeploymentRolloutStuck" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDeploymentRolloutStuck" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -209,12 +209,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeStatefulSetReplicasMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeStatefulSetReplicasMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -240,12 +240,12 @@ spec: kube_statefulset_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != kube_statefulset_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - for: 15m + for: {{ dig "KubeStatefulSetGenerationMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeStatefulSetGenerationMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -285,12 +285,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeStatefulSetUpdateNotRolledOut" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeStatefulSetUpdateNotRolledOut" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -336,12 +336,12 @@ spec: == 0 ) - for: 15m + for: {{ dig "KubeDaemonSetRolloutStuck" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDaemonSetRolloutStuck" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -364,12 +364,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) > 0 - for: 1h + for: {{ dig "KubeContainerWaiting" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeContainerWaiting" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -395,12 +395,12 @@ spec: kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - kube_daemonset_status_current_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 - for: 10m + for: {{ dig "KubeDaemonSetNotScheduled" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDaemonSetNotScheduled" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -423,12 +423,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. expr: kube_daemonset_status_number_misscheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 - for: 15m + for: {{ dig "KubeDaemonSetMisScheduled" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeDaemonSetMisScheduled" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -455,7 +455,7 @@ spec: and kube_job_status_active{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0) > 43200 labels: - severity: warning + severity: {{ dig "KubeJobNotCompleted" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -478,12 +478,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed summary: Job failed to complete. expr: kube_job_failed{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 - for: 15m + for: {{ dig "KubeJobFailed" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeJobFailed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -519,12 +519,12 @@ spec: kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 - for: 15m + for: {{ dig "KubeHpaReplicasMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeHpaReplicasMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -550,12 +550,12 @@ spec: kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} == kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - for: 15m + for: {{ dig "KubeHpaMaxedOut" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeHpaMaxedOut" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml index f0e49dc3b131..3eb2be423f30 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -41,12 +41,12 @@ spec: sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="{{ $kubeStateMetricsJob }}",}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 and (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 - for: 10m + for: {{ dig "KubeCPUOvercommit" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeCPUOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -72,12 +72,12 @@ spec: sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 and (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 - for: 10m + for: {{ dig "KubeMemoryOvercommit" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeMemoryOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -104,12 +104,12 @@ spec: / sum(kube_node_status_allocatable{resource="cpu", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) > 1.5 - for: 5m + for: {{ dig "KubeCPUQuotaOvercommit" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeCPUQuotaOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -136,12 +136,12 @@ spec: / sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) > 1.5 - for: 5m + for: {{ dig "KubeMemoryQuotaOvercommit" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeMemoryQuotaOvercommit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -168,12 +168,12 @@ spec: / ignoring(instance, job, type) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) > 0.9 < 1 - for: 15m + for: {{ dig "KubeQuotaAlmostFull" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "KubeQuotaAlmostFull" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -200,12 +200,12 @@ spec: / ignoring(instance, job, type) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) == 1 - for: 15m + for: {{ dig "KubeQuotaFullyUsed" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "KubeQuotaFullyUsed" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -232,12 +232,12 @@ spec: / ignoring(instance, job, type) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) > 1 - for: 15m + for: {{ dig "KubeQuotaExceeded" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeQuotaExceeded" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -264,12 +264,12 @@ spec: / sum(increase(container_cpu_cfs_periods_total{}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster, container, pod, namespace) > ( 25 / 100 ) - for: 15m + for: {{ dig "CPUThrottlingHigh" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "CPUThrottlingHigh" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesResources }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml index 1927c7ad4ea1..dfb99607d20f 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml @@ -35,7 +35,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. + description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: |- @@ -50,12 +50,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m + for: {{ dig "KubePersistentVolumeFillingUp" "for" "1m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubePersistentVolumeFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -74,7 +74,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. + description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to fill up within four days. 
Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: |- @@ -91,12 +91,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h + for: {{ dig "KubePersistentVolumeFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePersistentVolumeFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -115,7 +115,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes. + description: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup summary: PersistentVolumeInodes are filling up. expr: |- @@ -130,12 +130,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m + for: {{ dig "KubePersistentVolumeInodesFillingUp" "for" "1m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubePersistentVolumeInodesFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -154,7 +154,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free. 
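The backtick escapes such as {{`{{`}} emit literal {{ and }} during chart rendering, so the braces reach Prometheus intact as alert-template syntax. Rendered by Helm, the rewritten descriptions in these storage hunks take this shape (wrapped here for readability):

    description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
      in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}}
      on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.

Go templates skip a `with` block whose value is empty, so alerts from single-cluster setups without a `cluster` label no longer render a dangling "on Cluster" fragment, which appears to be the point of this change.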
+ description: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup summary: PersistentVolumeInodes are filling up. expr: |- @@ -171,12 +171,12 @@ spec: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h + for: {{ dig "KubePersistentVolumeInodesFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubePersistentVolumeInodesFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -195,16 +195,16 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.kubernetesStorage | indent 8 }} {{- end }} - description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. + description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors summary: PersistentVolume is having issues with provisioning. expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="{{ $kubeStateMetricsJob }}"} > 0 - for: 5m + for: {{ dig "KubePersistentVolumeErrors" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubePersistentVolumeErrors" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesStorage }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml index 30601baa59e5..3e2d9c69fbcc 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 - for: 5m + for: {{ dig "KubeClientCertificateExpiration" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeClientCertificateExpiration" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -65,12 +65,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 - for: 5m + for: {{ dig "KubeClientCertificateExpiration" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeClientCertificateExpiration" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -94,7 +94,7 @@ spec: summary: Kubernetes aggregated API has reported errors. expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 labels: - severity: warning + severity: {{ dig "KubeAggregatedAPIErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -117,12 +117,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown summary: Kubernetes aggregated API is down. expr: (1 - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 - for: 5m + for: {{ dig "KubeAggregatedAPIDown" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeAggregatedAPIDown" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -146,12 +146,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown summary: Target disappeared from Prometheus target discovery. 
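One subtlety for overrides in this file: both KubeClientCertificateExpiration hunks above — the seven-day (604800s) warning variant and the one-day (86400s) critical variant — resolve the same `customRules` key, so a single entry changes both alerts at once. Rules declared without a `for:` clause, such as KubeAggregatedAPIErrors, expose only the `severity` override. A hypothetical entry:

    customRules:
      KubeClientCertificateExpiration:
        for: 10m   # illustrative; applies to both the warning and critical variants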
expr: absent(up{job="apiserver"} == 1) - for: 15m + for: {{ dig "KubeAPIDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeAPIDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -175,12 +175,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests. expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 - for: 5m + for: {{ dig "KubeAPITerminatedRequests" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeAPITerminatedRequests" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml index 8c8d94379c83..e24bcac0e67e 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-controller-manager.yaml @@ -38,12 +38,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-controller-manager"} == 1) - for: 15m + for: {{ dig "KubeControllerManagerDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeControllerManagerDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeControllerManager }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml index f52f36d4f63b..90fc75caff8c 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeproxydown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-proxy"} == 1) - for: 15m + for: {{ dig "KubeProxyDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeProxyDown" "labelsSeverity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeProxy }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -54,4 +54,4 @@ spec: {{- end }} {{- end }} {{- end }} -{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml index 75efdd647171..b71e86607877 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml @@ -38,12 +38,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready summary: Node is not ready. expr: kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",condition="Ready",status="true"} == 0 - for: 15m + for: {{ dig "KubeNodeNotReady" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeNodeNotReady" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -66,12 +66,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable summary: Node is unreachable. expr: (kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m + for: {{ dig "KubeNodeUnreachable" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeNodeUnreachable" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -101,12 +101,12 @@ spec: max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) ( kube_node_status_capacity{job="{{ $kubeStateMetricsJob }}",resource="pods"} != 1 ) > 0.95 - for: 15m + for: {{ dig "KubeletTooManyPods" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "KubeletTooManyPods" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -129,12 +129,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping summary: Node readiness status is flapping. 
expr: sum(changes(kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",status="true",condition="Ready"}[15m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) > 2 - for: 15m + for: {{ dig "KubeNodeReadinessFlapping" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeNodeReadinessFlapping" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -157,12 +157,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m + for: {{ dig "KubeletPlegDurationHigh" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletPlegDurationHigh" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -185,12 +185,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh summary: Kubelet Pod startup latency is too high. expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le)) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 - for: 15m + for: {{ dig "KubeletPodStartUpLatencyHigh" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletPodStartUpLatencyHigh" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -214,7 +214,7 @@ spec: summary: Kubelet client certificate is about to expire. expr: kubelet_certificate_manager_client_ttl_seconds < 604800 labels: - severity: warning + severity: {{ dig "KubeletClientCertificateExpiration" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -238,7 +238,7 @@ spec: summary: Kubelet client certificate is about to expire. expr: kubelet_certificate_manager_client_ttl_seconds < 86400 labels: - severity: critical + severity: {{ dig "KubeletClientCertificateExpiration" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -262,7 +262,7 @@ spec: summary: Kubelet server certificate is about to expire. expr: kubelet_certificate_manager_server_ttl_seconds < 604800 labels: - severity: warning + severity: {{ dig "KubeletServerCertificateExpiration" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -286,7 +286,7 @@ spec: summary: Kubelet server certificate is about to expire. expr: kubelet_certificate_manager_server_ttl_seconds < 86400 labels: - severity: critical + severity: {{ dig "KubeletServerCertificateExpiration" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -309,12 +309,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors summary: Kubelet has failed to renew its client certificate. expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m + for: {{ dig "KubeletClientCertificateRenewalErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletClientCertificateRenewalErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -337,12 +337,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors summary: Kubelet has failed to renew its server certificate. expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m + for: {{ dig "KubeletServerCertificateRenewalErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeletServerCertificateRenewalErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -366,12 +366,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) - for: 15m + for: {{ dig "KubeletDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeletDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml index 7ca3c9b9fdb7..4fcae45422df 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-scheduler.yaml @@ -38,12 +38,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown summary: Target disappeared from Prometheus target discovery. expr: absent(up{job="kube-scheduler"} == 1) - for: 15m + for: {{ dig "KubeSchedulerDown" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "KubeSchedulerDown" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeSchedulerAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml index a32747686e05..362580b72df5 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch summary: Different semantic versions of Kubernetes components running. expr: count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m + for: {{ dig "KubeVersionMismatch" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeVersionMismatch" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -69,12 +69,12 @@ spec: / sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, job, namespace)) > 0.01 - for: 15m + for: {{ dig "KubeClientErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "KubeClientErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml index 34d7d833cbe2..25b2b68c840a 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml @@ -44,12 +44,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemSpaceFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemSpaceFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -79,12 +79,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemSpaceFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemSpaceFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -112,12 +112,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 30m + for: {{ dig "NodeFilesystemAlmostOutOfSpace" "for" "30m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemAlmostOutOfSpace" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -145,12 +145,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 30m + for: {{ dig "NodeFilesystemAlmostOutOfSpace" "for" "30m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemAlmostOutOfSpace" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -180,12 +180,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemFilesFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemFilesFillingUp" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -215,12 +215,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemFilesFillingUp" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemFilesFillingUp" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -248,12 +248,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemAlmostOutOfFiles" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFilesystemAlmostOutOfFiles" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -281,12 +281,12 @@ spec: and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) - for: 1h + for: {{ dig "NodeFilesystemAlmostOutOfFiles" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFilesystemAlmostOutOfFiles" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -309,12 +309,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs summary: Network interface is reporting many receive errors. expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 - for: 1h + for: {{ dig "NodeNetworkReceiveErrs" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeNetworkReceiveErrs" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -337,12 +337,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs summary: Network interface is reporting many transmit errors. expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 - for: 1h + for: {{ dig "NodeNetworkTransmitErrs" "for" "1h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeNetworkTransmitErrs" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -366,7 +366,7 @@ spec: summary: Number of conntrack are getting close to the limit. expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 labels: - severity: warning + severity: {{ dig "NodeHighNumberConntrackEntriesUsed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -390,7 +390,7 @@ spec: summary: Node Exporter text file collector failed to scrape. expr: node_textfile_scrape_error{job="node-exporter"} == 1 labels: - severity: warning + severity: {{ dig "NodeTextFileCollectorScrapeError" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -424,12 +424,12 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) - for: 10m + for: {{ dig "NodeClockSkewDetected" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeClockSkewDetected" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -455,12 +455,12 @@ spec: min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 - for: 10m + for: {{ dig "NodeClockNotSynchronising" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeClockNotSynchronising" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -483,12 +483,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded summary: RAID Array is degraded. expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 - for: 15m + for: {{ dig "NodeRAIDDegraded" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeRAIDDegraded" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -512,7 +512,7 @@ spec: summary: Failed device in RAID array. 
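The node-exporter rules repeat the key-sharing pattern noted earlier: the warning and critical variants of NodeFilesystemSpaceFillingUp, NodeFilesystemAlmostOutOfSpace, NodeFilesystemFilesFillingUp, and NodeFilesystemAlmostOutOfFiles each dig one shared key, and alerts without a `for:` clause (NodeHighNumberConntrackEntriesUsed, NodeTextFileCollectorScrapeError, NodeRAIDDiskFailure) again take only a `severity` override. To verify an override lands in the rendered rule, rendering just this template, e.g. `helm template . --show-only templates/prometheus/rules-1.14/node-exporter.yaml -f override.yaml`, works well, where override.yaml holds something like:

    customRules:
      NodeRAIDDegraded:
        for: 30m            # illustrative: fire later than the 15m default
        severity: warning   # illustrative: downgrade from the default critical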
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 labels: - severity: warning + severity: {{ dig "NodeRAIDDiskFailure" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -538,12 +538,12 @@ spec: ( node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 ) - for: 15m + for: {{ dig "NodeFileDescriptorLimit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeFileDescriptorLimit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -569,12 +569,12 @@ spec: ( node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 ) - for: 15m + for: {{ dig "NodeFileDescriptorLimit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "NodeFileDescriptorLimit" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -599,12 +599,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodecpuhighusage summary: High CPU usage. expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90 - for: 15m + for: {{ dig "NodeCPUHighUsage" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: info + severity: {{ dig "NodeCPUHighUsage" "severity" "info" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -633,12 +633,12 @@ spec: expr: |- node_load1{job="node-exporter"} / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 - for: 15m + for: {{ dig "NodeSystemSaturation" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeSystemSaturation" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -665,12 +665,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodememorymajorpagesfaults summary: Memory major page faults are occurring at very high rate. expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 - for: 15m + for: {{ dig "NodeMemoryMajorPagesFaults" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeMemoryMajorPagesFaults" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -695,12 +695,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodememoryhighutilization summary: Host is running out of memory. expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 - for: 15m + for: {{ dig "NodeMemoryHighUtilization" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeMemoryHighUtilization" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -719,7 +719,7 @@ spec: {{- if .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting }} {{ toYaml .Values.defaultRules.additionalRuleGroupAnnotations.nodeExporterAlerting | indent 8 }} {{- end }} - description: 'Disk IO queue (aqu-sq) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}, has been above 10 for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. + description: 'Disk IO queue (aqu-sq) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}, has been above 10 for the last 30 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. This symptom might indicate disk saturation. @@ -727,12 +727,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodediskiosaturation summary: Disk IO queue is high. expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10 - for: 30m + for: {{ dig "NodeDiskIOSaturation" "for" "30m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeDiskIOSaturation" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -755,12 +755,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodesystemdservicefailed summary: Systemd service has entered failed state. expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m + for: {{ dig "NodeSystemdServiceFailed" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeSystemdServiceFailed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -783,12 +783,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodebondingdegraded summary: Bonding interface is degraded expr: (node_bonding_slaves - node_bonding_active) != 0 - for: 5m + for: {{ dig "NodeBondingDegraded" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeBondingDegraded" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.nodeExporterAlerting }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml index bc390506fb59..ecef04f22e20 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml @@ -37,12 +37,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping summary: Network interface is often changing its status expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 - for: 2m + for: {{ dig "NodeNetworkInterfaceFlapping" "for" "2m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "NodeNetworkInterfaceFlapping" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.network }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml index a26196df632e..bd7d97c236b1 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml @@ -39,12 +39,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorlisterrors summary: Errors while performing list operations in controller. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4 - for: 15m + for: {{ dig "PrometheusOperatorListErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorListErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} @@ -67,12 +67,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorwatcherrors summary: Errors while performing watch operations in controller. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4 - for: 15m + for: {{ dig "PrometheusOperatorWatchErrors" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorWatchErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -95,12 +95,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorsyncfailed summary: Last controller reconciliation failed expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m + for: {{ dig "PrometheusOperatorSyncFailed" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorSyncFailed" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -123,12 +123,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorreconcileerrors summary: Errors while reconciling objects. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1 - for: 10m + for: {{ dig "PrometheusOperatorReconcileErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorReconcileErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -151,12 +151,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorstatusupdateerrors summary: Errors while updating objects status. expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1 - for: 10m + for: {{ dig "PrometheusOperatorStatusUpdateErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorStatusUpdateErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -179,12 +179,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornodelookuperrors summary: Errors while reconciling Prometheus. expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1 - for: 10m + for: {{ dig "PrometheusOperatorNodeLookupErrors" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorNodeLookupErrors" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -207,12 +207,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornotready summary: Prometheus operator not ready expr: min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0) - for: 5m + for: {{ dig "PrometheusOperatorNotReady" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorNotReady" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -235,12 +235,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorrejectedresources summary: Resources rejected by Prometheus operator expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 5m + for: {{ dig "PrometheusOperatorRejectedResources" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOperatorRejectedResources" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheusOperator }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . 
| nindent 8 }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml index 48cfc7a449a2..907f7b30e736 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml @@ -42,12 +42,12 @@ spec: # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0 - for: 10m + for: {{ dig "PrometheusBadConfig" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "PrometheusBadConfig" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -70,12 +70,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheussdrefreshfailure summary: Failed Prometheus SD refresh. expr: increase(prometheus_sd_refresh_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[10m]) > 0 - for: 20m + for: {{ dig "PrometheusSDRefreshFailure" "for" "20m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusSDRefreshFailure" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -105,12 +105,12 @@ spec: > min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) ) - for: 15m + for: {{ dig "PrometheusNotificationQueueRunningFull" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusNotificationQueueRunningFull" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -140,12 +140,12 @@ spec: ) * 100 > 1 - for: 15m + for: {{ dig "PrometheusErrorSendingAlertsToSomeAlertmanagers" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusErrorSendingAlertsToSomeAlertmanagers" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -171,12 +171,12 @@ spec: # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1 - for: 10m + for: {{ dig "PrometheusNotConnectedToAlertmanagers" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusNotConnectedToAlertmanagers" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -199,12 +199,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbreloadsfailing summary: Prometheus has issues reloading blocks from disk. expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 - for: 4h + for: {{ dig "PrometheusTSDBReloadsFailing" "for" "4h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusTSDBReloadsFailing" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -227,12 +227,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbcompactionsfailing summary: Prometheus has issues compacting blocks. expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0 - for: 4h + for: {{ dig "PrometheusTSDBCompactionsFailing" "for" "4h" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusTSDBCompactionsFailing" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -256,7 +256,7 @@ spec: summary: Prometheus is not ingesting samples. expr: |- ( - rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0 + sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])) <= 0 and ( sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 @@ -264,12 +264,12 @@ spec: sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0 ) ) - for: 10m + for: {{ dig "PrometheusNotIngestingSamples" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusNotIngestingSamples" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -292,12 +292,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusduplicatetimestamps summary: Prometheus is dropping samples with duplicate timestamps. 
expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m + for: {{ dig "PrometheusDuplicateTimestamps" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusDuplicateTimestamps" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -320,12 +320,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusoutofordertimestamps summary: Prometheus drops samples with out-of-order timestamps. expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 10m + for: {{ dig "PrometheusOutOfOrderTimestamps" "for" "10m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusOutOfOrderTimestamps" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -359,12 +359,12 @@ spec: ) * 100 > 1 - for: 15m + for: {{ dig "PrometheusRemoteStorageFailures" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "PrometheusRemoteStorageFailures" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -395,12 +395,12 @@ spec: max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) ) > 120 - for: 15m + for: {{ dig "PrometheusRemoteWriteBehind" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "PrometheusRemoteWriteBehind" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -430,12 +430,12 @@ spec: > max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) ) - for: 15m + for: {{ dig "PrometheusRemoteWriteDesiredShards" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusRemoteWriteDesiredShards" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -458,12 +458,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusrulefailures summary: Prometheus is failing rule evaluations. 
expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m + for: {{ dig "PrometheusRuleFailures" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "PrometheusRuleFailures" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -486,12 +486,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusmissingruleevaluations summary: Prometheus is missing rule evaluations due to slow rule group evaluation. expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m + for: {{ dig "PrometheusMissingRuleEvaluations" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusMissingRuleEvaluations" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -514,12 +514,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetlimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m + for: {{ dig "PrometheusTargetLimitHit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusTargetLimitHit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -542,12 +542,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuslabellimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m + for: {{ dig "PrometheusLabelLimitHit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusLabelLimitHit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -570,12 +570,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapebodysizelimithit summary: Prometheus has dropped some targets that exceeded body size limit. 
expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m + for: {{ dig "PrometheusScrapeBodySizeLimitHit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusScrapeBodySizeLimitHit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -598,12 +598,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapesamplelimithit summary: Prometheus has failed scrapes that have exceeded the configured sample limit. expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0 - for: 15m + for: {{ dig "PrometheusScrapeSampleLimitHit" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusScrapeSampleLimitHit" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -626,12 +626,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetsyncfailure summary: Prometheus has failed to sync targets. expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0 - for: 5m + for: {{ dig "PrometheusTargetSyncFailure" "for" "5m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: critical + severity: {{ dig "PrometheusTargetSyncFailure" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -654,12 +654,12 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheushighqueryload summary: Prometheus is reaching its maximum capacity serving concurrent requests. expr: avg_over_time(prometheus_engine_queries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.8 - for: 15m + for: {{ dig "PrometheusHighQueryLoad" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" {{- end }} labels: - severity: warning + severity: {{ dig "PrometheusHighQueryLoad" "severity" "warning" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -689,12 +689,12 @@ spec: ) * 100 > 3 - for: 15m + for: {{ dig "PrometheusErrorSendingAlertsToAnyAlertmanager" "for" "15m" .Values.customRules }} {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . 
}}" {{- end }} labels: - severity: critical + severity: {{ dig "PrometheusErrorSendingAlertsToAnyAlertmanager" "severity" "critical" .Values.customRules }} {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.prometheus }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} diff --git a/charts/kube-prometheus-stack/values.yaml b/charts/kube-prometheus-stack/values.yaml index 529f5d048484..6a7cb06e2cdd 100644 --- a/charts/kube-prometheus-stack/values.yaml +++ b/charts/kube-prometheus-stack/values.yaml @@ -33,6 +33,15 @@ commonLabels: {} crds: enabled: true +## custom Rules to override "for" and "severity" in defaultRules +## +customRules: {} + # AlertmanagerFailedReload: + # for: 3m + # AlertmanagerMembersInconsistent: + # for: 5m + # severity: "warning" + ## Create default rules for monitoring the cluster ## defaultRules: From d3a05be806efae36afe9d1fc25d569183106094b Mon Sep 17 00:00:00 2001 From: zhouttong Date: Thu, 22 Feb 2024 21:12:52 +0800 Subject: [PATCH 10/16] [prometheus-pushgateway] Fix the notes for ClusterIP (#4265) (#4275) Signed-off-by: zhoutong12589 --- charts/prometheus-pushgateway/Chart.yaml | 2 +- charts/prometheus-pushgateway/templates/NOTES.txt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/charts/prometheus-pushgateway/Chart.yaml b/charts/prometheus-pushgateway/Chart.yaml index 1c16403bd669..a0e2af2f58f7 100644 --- a/charts/prometheus-pushgateway/Chart.yaml +++ b/charts/prometheus-pushgateway/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 appVersion: "v1.7.0" description: A Helm chart for prometheus pushgateway name: prometheus-pushgateway -version: 2.7.0 +version: 2.7.1 home: https://github.com/prometheus/pushgateway sources: - https://github.com/prometheus/pushgateway diff --git a/charts/prometheus-pushgateway/templates/NOTES.txt b/charts/prometheus-pushgateway/templates/NOTES.txt index 0196e2b37013..263b1d8d49dd 100644 --- a/charts/prometheus-pushgateway/templates/NOTES.txt +++ b/charts/prometheus-pushgateway/templates/NOTES.txt @@ -4,16 +4,16 @@ http{{ if $.Values.ingress.tls }}s{{ end }}://{{ . }}{{ $.Values.ingress.path }} {{- end }} {{- else if contains "NodePort" .Values.service.type }} - export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus-pushgateway.fullname" . }}) - export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + export NODE_PORT=$(kubectl get --namespace {{ template "prometheus-pushgateway.namespace" . }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ template "prometheus-pushgateway.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ template "prometheus-pushgateway.namespace" . }} -o jsonpath="{.items[0].status.addresses[0].address}") echo http://$NODE_IP:$NODE_PORT {{- else if contains "LoadBalancer" .Values.service.type }} NOTE: It may take a few minutes for the LoadBalancer IP to be available. You can watch the status of by running 'kubectl get svc -w {{ template "prometheus-pushgateway.fullname" . }}' - export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ template "prometheus-pushgateway.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + export SERVICE_IP=$(kubectl get svc --namespace {{ template "prometheus-pushgateway.namespace" . }} {{ template "prometheus-pushgateway.fullname" . 
}} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') echo http://$SERVICE_IP:{{ .Values.service.port }} {{- else if contains "ClusterIP" .Values.service.type }} - export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app={{ template "prometheus-pushgateway.name" . }},release={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") - echo "Visit http://127.0.0.1:9091 to use your application" + export POD_NAME=$(kubectl get pods --namespace {{ template "prometheus-pushgateway.namespace" . }} -l "app.kubernetes.io/name={{ template "prometheus-pushgateway.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") kubectl port-forward $POD_NAME 9091 + echo "Visit http://127.0.0.1:9091 to use your application" {{- end }} From 972e99da9b3bb3102bba89bf8cd125f678d71e68 Mon Sep 17 00:00:00 2001 From: Sulochan Acharya Date: Thu, 22 Feb 2024 20:52:30 +0545 Subject: [PATCH 11/16] [prometheus-rabbitmq-exporter] Allows reading User from secret (#4278) Currently the exporter allows using password from secret. This patch allows users to read username from secret as well. This is used in cases where automated username is created during install process of rabbitmq service. Signed-off-by: Sulochan Acharya --- charts/prometheus-rabbitmq-exporter/Chart.yaml | 2 +- .../templates/deployment.yaml | 9 +++++++-- charts/prometheus-rabbitmq-exporter/values.yaml | 3 +++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/charts/prometheus-rabbitmq-exporter/Chart.yaml b/charts/prometheus-rabbitmq-exporter/Chart.yaml index 0c01048e9abe..a07751509c2a 100644 --- a/charts/prometheus-rabbitmq-exporter/Chart.yaml +++ b/charts/prometheus-rabbitmq-exporter/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 description: Rabbitmq metrics exporter for prometheus name: prometheus-rabbitmq-exporter -version: 1.10.0 +version: 1.11.0 appVersion: v0.29.0 home: https://github.com/kbudde/rabbitmq_exporter sources: diff --git a/charts/prometheus-rabbitmq-exporter/templates/deployment.yaml b/charts/prometheus-rabbitmq-exporter/templates/deployment.yaml index 83aedf52df00..99c50e75662b 100644 --- a/charts/prometheus-rabbitmq-exporter/templates/deployment.yaml +++ b/charts/prometheus-rabbitmq-exporter/templates/deployment.yaml @@ -59,8 +59,13 @@ spec: - name: RABBIT_PASSWORD value: {{ .Values.rabbitmq.password }} {{- end }} - - {{- if .Values.rabbitmq.user }} + {{- if .Values.rabbitmq.existingUserSecret }} + - name: RABBIT_USER + valueFrom: + secretKeyRef: + name: "{{ .Values.rabbitmq.existingUserSecret }}" + key: "{{ .Values.rabbitmq.existingUserSecretKey }}" + {{- else if .Values.rabbitmq.user }} - name: RABBIT_USER value: {{ .Values.rabbitmq.user }} {{- end }} diff --git a/charts/prometheus-rabbitmq-exporter/values.yaml b/charts/prometheus-rabbitmq-exporter/values.yaml index 029e1e9b95e4..c28a0c303de3 100644 --- a/charts/prometheus-rabbitmq-exporter/values.yaml +++ b/charts/prometheus-rabbitmq-exporter/values.yaml @@ -39,6 +39,9 @@ rabbitmq: url: http://myrabbit:15672 user: guest password: guest + # If existingUserSecret is set then user is ignored + existingUserSecret: ~ + existingUserSecretKey: username # If existingPasswordSecret is set then password is ignored existingPasswordSecret: ~ existingPasswordSecretKey: password From acb2c39d90702fb9fbcff0d34501edcec20380ce Mon Sep 17 00:00:00 2001 From: Mike Tougeron Date: Thu, 22 Feb 2024 09:37:30 -0800 Subject: [PATCH 12/16] Fix the customResourceState.enabled args conditional (#4273) Signed-off-by: Mike Tougeron --- 
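Notes: moving the outer `{{- end }}` above the customResourceState block means the
`--custom-resource-state-config-file` flag is emitted whenever
`customResourceState.enabled` is true, instead of only when self-monitoring is also
enabled. A minimal sketch of values that would exercise the fixed conditional (the
CRD and metric below are hypothetical, following kube-state-metrics'
CustomResourceStateMetrics config format):

customResourceState:
  enabled: true
  config:
    kind: CustomResourceStateMetrics
    spec:
      resources:
        - groupVersionKind:
            group: example.com   # hypothetical CRD group
            version: "v1"
            kind: Widget
          metrics:
            - name: widget_replicas
              help: Replicas declared on each Widget
              each:
                type: Gauge
                gauge:
                  path: [spec, replicas]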
charts/kube-state-metrics/Chart.yaml | 2 +- charts/kube-state-metrics/templates/deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/kube-state-metrics/Chart.yaml b/charts/kube-state-metrics/Chart.yaml index 05da94e1dd2b..c420b344b4ba 100644 --- a/charts/kube-state-metrics/Chart.yaml +++ b/charts/kube-state-metrics/Chart.yaml @@ -7,7 +7,7 @@ keywords: - prometheus - kubernetes type: application -version: 5.16.0 +version: 5.16.1 appVersion: 2.10.1 home: https://github.com/kubernetes/kube-state-metrics/ sources: diff --git a/charts/kube-state-metrics/templates/deployment.yaml b/charts/kube-state-metrics/templates/deployment.yaml index 373f7dcc56a0..521817291be2 100644 --- a/charts/kube-state-metrics/templates/deployment.yaml +++ b/charts/kube-state-metrics/templates/deployment.yaml @@ -115,10 +115,10 @@ spec: {{- if .Values.selfMonitor.telemetryPort }} - --telemetry-port={{ $telemetryPort }} {{- end }} + {{- end }} {{- if .Values.customResourceState.enabled }} - --custom-resource-state-config-file=/etc/customresourcestate/config.yaml {{- end }} - {{- end }} {{- if or (.Values.kubeconfig.enabled) (.Values.customResourceState.enabled) (.Values.volumeMounts) }} volumeMounts: {{- if .Values.kubeconfig.enabled }} From 35b15f01cf17166d5bf1c697231e9ec0976c93ec Mon Sep 17 00:00:00 2001 From: dbeltman <36163623+dbeltman@users.noreply.github.com> Date: Thu, 22 Feb 2024 23:58:19 +0100 Subject: [PATCH 13/16] [prometheus-smartctl-exporter] Fix typo in (prometheus)-rules.txt (#4284) * Fix typo in (prometheus)-rules.txt Signed-off-by: dbeltman <36163623+dbeltman@users.noreply.github.com> * Update Chart.yaml Signed-off-by: dbeltman <36163623+dbeltman@users.noreply.github.com> --------- Signed-off-by: dbeltman <36163623+dbeltman@users.noreply.github.com> --- charts/prometheus-smartctl-exporter/Chart.yaml | 2 +- charts/prometheus-smartctl-exporter/rules/rules.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/prometheus-smartctl-exporter/Chart.yaml b/charts/prometheus-smartctl-exporter/Chart.yaml index e1236b481dbf..7a13758d0263 100644 --- a/charts/prometheus-smartctl-exporter/Chart.yaml +++ b/charts/prometheus-smartctl-exporter/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.7.0 +version: 0.7.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/prometheus-smartctl-exporter/rules/rules.txt b/charts/prometheus-smartctl-exporter/rules/rules.txt index faf89055b2ed..ad249080fe0e 100644 --- a/charts/prometheus-smartctl-exporter/rules/rules.txt +++ b/charts/prometheus-smartctl-exporter/rules/rules.txt @@ -13,10 +13,10 @@ rules: for: 1m labels: severity: warning -- alert: SmartCTLDeviceAvailableSpareUnderThreadhold +- alert: SmartCTLDeviceAvailableSpareUnderThreshold expr: smartctl_device_available_spare_threshold > smartctl_device_available_spare annotations: - message: Device {{ $labels.device }} on instance {{ $labels.instance }} is under available spare threashold. + message: Device {{ $labels.device }} on instance {{ $labels.instance }} is under available spare threshold. 
for: 1m labels: severity: warning From 1ab1aea6fe8a0ab450294eea6c2dea7b3bbf19d5 Mon Sep 17 00:00:00 2001 From: Devin Buhl Date: Fri, 23 Feb 2024 08:25:08 -0500 Subject: [PATCH 14/16] [prometheus-snmp-exporter] default serviceMonitor to Release Namespace (#4086) * [prometheus-snmp-exporter] default serviceMonitor to Release Namespace Signed-off-by: Devin Buhl * Update values.yaml Signed-off-by: Devin Buhl * Update Chart.yaml Signed-off-by: Devin Buhl * Update Chart.yaml Signed-off-by: Devin Buhl * Update README.md Signed-off-by: Devin Buhl * Update values.yaml Signed-off-by: Devin Buhl * Update README.md Signed-off-by: Devin Buhl --------- Signed-off-by: Devin Buhl Co-authored-by: MH --- charts/prometheus-snmp-exporter/Chart.yaml | 2 +- charts/prometheus-snmp-exporter/README.md | 8 ++++++++ .../templates/servicemonitor.yaml | 4 +--- charts/prometheus-snmp-exporter/values.yaml | 3 ++- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/charts/prometheus-snmp-exporter/Chart.yaml b/charts/prometheus-snmp-exporter/Chart.yaml index 1597d22e0608..6a5e19f1eb7a 100644 --- a/charts/prometheus-snmp-exporter/Chart.yaml +++ b/charts/prometheus-snmp-exporter/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 description: Prometheus SNMP Exporter name: prometheus-snmp-exporter -version: 1.8.2 +version: 2.0.0 appVersion: v0.21.0 home: https://github.com/prometheus/snmp_exporter sources: diff --git a/charts/prometheus-snmp-exporter/README.md b/charts/prometheus-snmp-exporter/README.md index a3e66b989f61..b29a1f7869a0 100644 --- a/charts/prometheus-snmp-exporter/README.md +++ b/charts/prometheus-snmp-exporter/README.md @@ -45,6 +45,10 @@ helm upgrade [RELEASE_NAME] [CHART] --install _See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/) for command documentation._ +### Upgrading an existing Release to a new major version + +A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an incompatible breaking change needing manual actions. + ### To 1.0.0 This version allows multiple Targets to be specified when using ServiceMonitor. When you use ServiceMonitor, please rewrite below: @@ -73,6 +77,10 @@ serviceMonitor: target: 127.0.0.1 ``` +### To 2.0.0 + +This version changes the `serviceMonitor.namespace` value from `monitoring` to the namespace the release is deployed to. + ## Configuration See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_helm/#customizing-the-chart-before-installing). 
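For example, after the 2.0.0 default change a release that still needs the ServiceMonitor in a fixed namespace can set the value explicitly (an illustrative override; `monitoring` is simply the former default):

serviceMonitor:
  enabled: true
  namespace: monitoring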
To see all configurable options with detailed comments, visit the chart's [values.yaml](./values.yaml), or run these configuration commands: diff --git a/charts/prometheus-snmp-exporter/templates/servicemonitor.yaml b/charts/prometheus-snmp-exporter/templates/servicemonitor.yaml index 25e8406b0c42..b7eefb80943d 100644 --- a/charts/prometheus-snmp-exporter/templates/servicemonitor.yaml +++ b/charts/prometheus-snmp-exporter/templates/servicemonitor.yaml @@ -5,9 +5,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: {{ template "prometheus-snmp-exporter.fullname" $ }}-{{ .name }} - {{- if $.Values.serviceMonitor.namespace }} - namespace: {{ $.Values.serviceMonitor.namespace }} - {{- end }} + namespace: {{ $.Values.serviceMonitor.namespace | default $.Release.Namespace }} labels: {{- include "prometheus-snmp-exporter.labels" $ | indent 4 }} {{- range $key, $value := .labels | default $.Values.serviceMonitor.selector }} diff --git a/charts/prometheus-snmp-exporter/values.yaml b/charts/prometheus-snmp-exporter/values.yaml index 3826babe7dc6..07684eeb23d1 100644 --- a/charts/prometheus-snmp-exporter/values.yaml +++ b/charts/prometheus-snmp-exporter/values.yaml @@ -146,7 +146,8 @@ configmapReload: # A service monitor will be created per each item in serviceMonitor.params[] serviceMonitor: enabled: false - namespace: monitoring + # Default value is the namespace the release is deployed to + # namespace: monitoring path: /snmp From dfda262421be6387ca2b38e23da50d6ff1c6fd05 Mon Sep 17 00:00:00 2001 From: Sebastian Gaiser Date: Fri, 23 Feb 2024 14:54:32 +0100 Subject: [PATCH 15/16] [prometheus-snmp-exporter] make strategy configurable (#4241) Signed-off-by: Sebastian Gaiser --- charts/prometheus-snmp-exporter/Chart.yaml | 2 +- charts/prometheus-snmp-exporter/templates/deployment.yaml | 5 +---- charts/prometheus-snmp-exporter/values.yaml | 6 ++++++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/charts/prometheus-snmp-exporter/Chart.yaml b/charts/prometheus-snmp-exporter/Chart.yaml index 6a5e19f1eb7a..d0b56e936b4a 100644 --- a/charts/prometheus-snmp-exporter/Chart.yaml +++ b/charts/prometheus-snmp-exporter/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 description: Prometheus SNMP Exporter name: prometheus-snmp-exporter -version: 2.0.0 +version: 2.1.0 appVersion: v0.21.0 home: https://github.com/prometheus/snmp_exporter sources: diff --git a/charts/prometheus-snmp-exporter/templates/deployment.yaml b/charts/prometheus-snmp-exporter/templates/deployment.yaml index 8c9a60539277..54c58a05ea35 100644 --- a/charts/prometheus-snmp-exporter/templates/deployment.yaml +++ b/charts/prometheus-snmp-exporter/templates/deployment.yaml @@ -12,10 +12,7 @@ spec: matchLabels: {{- include "prometheus-snmp-exporter.selectorLabels" . 
| indent 6 }} strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 0 - type: RollingUpdate +{{ toYaml .Values.strategy | indent 4 }} template: metadata: labels: diff --git a/charts/prometheus-snmp-exporter/values.yaml b/charts/prometheus-snmp-exporter/values.yaml index 07684eeb23d1..e608a6e20946 100644 --- a/charts/prometheus-snmp-exporter/values.yaml +++ b/charts/prometheus-snmp-exporter/values.yaml @@ -209,3 +209,9 @@ extraManifests: [] # name: prometheus-extra # data: # extra-data: "value" + +strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate From 3173f0f0b3c040ac75617c5e8497dc479afc11f0 Mon Sep 17 00:00:00 2001 From: zeritti <47476160+zeritti@users.noreply.github.com> Date: Sat, 24 Feb 2024 01:24:05 +0000 Subject: [PATCH 16/16] [prometheus-node-exporter] Fix unclosed action in daemonset template (#4280) * [prometheus-node-exporter] Remove new lines within action Signed-off-by: zeritti <47476160+zeritti@users.noreply.github.com> * Raise chart version Signed-off-by: zeritti <47476160+zeritti@users.noreply.github.com> --------- Signed-off-by: zeritti <47476160+zeritti@users.noreply.github.com> Co-authored-by: MH --- charts/prometheus-node-exporter/Chart.yaml | 2 +- .../prometheus-node-exporter/templates/daemonset.yaml | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/charts/prometheus-node-exporter/Chart.yaml b/charts/prometheus-node-exporter/Chart.yaml index 4c1a176202e8..7e88090e7ddc 100644 --- a/charts/prometheus-node-exporter/Chart.yaml +++ b/charts/prometheus-node-exporter/Chart.yaml @@ -6,7 +6,7 @@ keywords: - prometheus - exporter type: application -version: 4.30.2 +version: 4.30.3 appVersion: 1.7.0 home: https://github.com/prometheus/node_exporter/ sources: diff --git a/charts/prometheus-node-exporter/templates/daemonset.yaml b/charts/prometheus-node-exporter/templates/daemonset.yaml index 152ec7dc118d..f0e345b972b8 100644 --- a/charts/prometheus-node-exporter/templates/daemonset.yaml +++ b/charts/prometheus-node-exporter/templates/daemonset.yaml @@ -176,14 +176,8 @@ spec: mountPath: {{ .mountPath }} {{- end }} {{- range .Values.sidecars }} - {{- $overwrites := dict - "volumeMounts" (concat (include "prometheus-node-exporter.sidecarVolumeMounts" $ | fromYamlArray) (.volumeMounts | default list) | default list) - }} - {{- $defaults := dict - "image" (include "prometheus-node-exporter.image" $) - "securityContext" $.Values.containerSecurityContext - "imagePullPolicy" $.Values.image.pullPolicy - }} + {{- $overwrites := dict "volumeMounts" (concat (include "prometheus-node-exporter.sidecarVolumeMounts" $ | fromYamlArray) (.volumeMounts | default list) | default list) }} + {{- $defaults := dict "image" (include "prometheus-node-exporter.image" $) "securityContext" $.Values.containerSecurityContext "imagePullPolicy" $.Values.image.pullPolicy }} - {{- toYaml (merge $overwrites . $defaults) | nindent 10 }} {{- end }} {{- if .Values.kubeRBACProxy.enabled }}