From c7ccb04625d38f69272769aac0878a023aa3605b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?=
Date: Wed, 22 Nov 2023 10:51:15 +0100
Subject: [PATCH] [kube-prometheus-stack] Configure additional aggregation labels + k8s.rules split (#3883)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [kube-prometheus-stack] Configure additional aggregation labels

Signed-off-by: Jan-Otto Kröpke

* Re-run sync

Signed-off-by: Jan-Otto Kröpke

* Re-run sync

Signed-off-by: Jan-Otto Kröpke

* Add new values for rules

Signed-off-by: Jan-Otto Kröpke

* Add new values for rules

Signed-off-by: Jan-Otto Kröpke

* Add new values for rules

Signed-off-by: Jan-Otto Kröpke

* [kube-prometheus-stack] Fix names

Signed-off-by: Jan-Otto Kröpke

* bump version

Signed-off-by: Jan-Otto Kröpke

---------

Signed-off-by: Jan-Otto Kröpke
Signed-off-by: Jan-Otto Kröpke
Signed-off-by: Jan-Otto Kröpke
Co-authored-by: Gabriel Martinez <19713226+GMartinez-Sisti@users.noreply.github.com>
Signed-off-by: Sacha
---
 charts/kube-prometheus-stack/Chart.yaml       |   2 +-
 .../hack/sync_prometheus_rules.py             |  20 +-
 .../templates/prometheus/_rules.tpl           |   8 +-
 .../rules-1.14/alertmanager.rules.yaml        |  20 +-
 .../prometheus/rules-1.14/general.rules.yaml  |   2 +-
 ...les.container_cpu_usage_seconds_total.yaml |  43 +++
 .../k8s.rules.container_memory_cache.yaml     |  42 +++
 .../k8s.rules.container_memory_rss.yaml       |  42 +++
 .../k8s.rules.container_memory_swap.yaml      |  42 +++
 ...es.container_memory_working_set_bytes.yaml |  42 +++
 .../k8s.rules.container_resource.yaml         | 167 +++++++++
 .../rules-1.14/k8s.rules.pod_owner.yaml       | 106 ++++++
 .../prometheus/rules-1.14/k8s.rules.yaml      | 322 ------------------
 .../kube-apiserver-availability.rules.yaml    |  60 ++--
 .../kube-apiserver-burnrate.rules.yaml        | 140 ++++----
 .../kube-apiserver-histogram.rules.yaml       |   4 +-
 .../rules-1.14/kube-state-metrics.yaml        |  14 +-
 .../prometheus/rules-1.14/kubelet.rules.yaml  |   6 +-
 .../rules-1.14/kubernetes-apps.yaml           |  12 +-
 .../rules-1.14/kubernetes-resources.yaml      |  20 +-
 .../rules-1.14/kubernetes-storage.yaml        |  16 +-
 .../kubernetes-system-apiserver.yaml          |   8 +-
 .../rules-1.14/kubernetes-system-kubelet.yaml |  10 +-
 .../rules-1.14/kubernetes-system.yaml         |   6 +-
 .../prometheus/rules-1.14/node.rules.yaml     |  16 +-
 .../rules-1.14/prometheus-operator.yaml       |   8 +-
 charts/kube-prometheus-stack/values.yaml      |  23 +-
 27 files changed, 700 insertions(+), 501 deletions(-)
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml
 delete mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml

diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml
index 5233e847fe08..d367c59621c2 100644
--- a/charts/kube-prometheus-stack/Chart.yaml
+++ b/charts/kube-prometheus-stack/Chart.yaml
@@ -21,7 +21,7 @@ name: kube-prometheus-stack
 sources:
   - https://github.com/prometheus-community/helm-charts
   - https://github.com/prometheus-operator/kube-prometheus
-version: 54.1.0
+version: 54.2.0
 appVersion: v0.69.1
 kubeVersion: ">=1.19.0-0"
 home: https://github.com/prometheus-operator/kube-prometheus
diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
index 75b796f3e12f..b7f6a5de091c 100755
--- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
+++ b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
@@ -94,7 +94,13 @@ def new_representer(dumper, data):
     'config-reloaders': ' .Values.defaultRules.rules.configReloaders',
     'etcd': ' .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd',
     'general.rules': ' .Values.defaultRules.rules.general',
-    'k8s.rules': ' .Values.defaultRules.rules.k8s',
+    'k8s.rules.container_cpu_usage_seconds_total': ' .Values.defaultRules.rules.k8sContainerCpuUsageSecondsTotal',
+    'k8s.rules.container_memory_cache': ' .Values.defaultRules.rules.k8sContainerMemoryCache',
+    'k8s.rules.container_memory_rss': ' .Values.defaultRules.rules.k8sContainerMemoryRss',
+    'k8s.rules.container_memory_swap': ' .Values.defaultRules.rules.k8sContainerMemorySwap',
+    'k8s.rules.container_memory_working_set_bytes': ' .Values.defaultRules.rules.k8sContainerMemoryWorkingSetBytes',
+    'k8s.rules.container_resource': ' .Values.defaultRules.rules.k8sContainerResource',
+    'k8s.rules.pod_owner': ' .Values.defaultRules.rules.k8sPodOwner',
     'kube-apiserver-availability.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverAvailability',
     'kube-apiserver-burnrate.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverBurnrate',
     'kube-apiserver-histogram.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverHistogram',
@@ -417,7 +423,7 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
     rules = add_rules_per_rule_conditions(rules, group)
     # initialize header
     lines = header % {
-        'name': group['name'],
+        'name': sanitize_name(group['name']),
         'url': url,
         'condition': condition_map.get(group['name'], ''),
         'init_line': init_line,
@@ -426,7 +432,11 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
     }

     # rules themselves
-    lines += rules
+    lines += re.sub(
+        r'\s(by|on) ?\(',
+        r' \1 ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}', + rules + ) # footer lines += '{{- end }}' @@ -541,6 +551,10 @@ def main(): print("Finished") +def sanitize_name(name): + return re.sub('[_]', '-', name).lower() + + def jsonnet_import_callback(base, rel): if "github.com" in base: base = os.getcwd() + '/vendor/' + base[base.find('github.com'):] diff --git a/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl b/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl index e74955c7a8f9..5fe564f3ea9b 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl +++ b/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl @@ -8,7 +8,13 @@ rules: - "config-reloaders" - "etcd" - "general.rules" - - "k8s.rules" + - "k8s.rules.container_cpu_usage_seconds_total" + - "k8s.rules.container_memory_cache" + - "k8s.rules.container_memory_rss" + - "k8s.rules.container_memory_swap" + - "k8s.rules.container_memory_working_set_bytes" + - "k8s.rules.container_resource" + - "k8s.rules.pod_owner" - "kube-apiserver-availability.rules" - "kube-apiserver-burnrate.rules" - "kube-apiserver-histogram.rules" diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml index 30ec6c0be2e8..c306dc6c18ad 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml @@ -73,8 +73,8 @@ spec: # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) - < on (namespace,service,cluster) group_left - count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) + < on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) group_left + count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -137,7 +137,7 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. expr: |- - min by (namespace,service, integration) ( + min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service, integration) ( rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) / ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) @@ -171,7 +171,7 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. expr: |- - min by (namespace,service, integration) ( + min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
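A minimal sketch, not part of the patch, of what the two helpers above do: the re.sub call injects the configurable aggregation labels into every "by (" / "on (" clause of the synced rules, and sanitize_name maps a group name containing underscores (invalid in a Kubernetes resource name) to its hyphenated, lowercase form. The sample expression is illustrative:

    import re

    # Replacement fragment, verbatim from the sync_prometheus_rules.py change above.
    AGG = r' \1 ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}'

    def sanitize_name(name):
        return re.sub('[_]', '-', name).lower()

    expr = 'sum by (cluster, code) (code_verb:apiserver_request_total:increase30d)'
    print(re.sub(r'\s(by|on) ?\(', AGG, expr))
    # sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code) (code_verb:apiserver_request_total:increase30d)

    print(sanitize_name('k8s.rules.pod_owner'))
    # k8s.rules.pod-owner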
}},{{ end }}namespace,service, integration) ( rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) / ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) @@ -205,8 +205,8 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerconfiginconsistent summary: Alertmanager instances within the same cluster have different configurations. expr: |- - count by (namespace,service,cluster) ( - count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) + count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ( + count_values by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) ) != 1 for: 20m @@ -238,11 +238,11 @@ spec: summary: Half or more of the Alertmanager instances within the same cluster are down. expr: |- ( - count by (namespace,service,cluster) ( + count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ( avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5 ) / - count by (namespace,service,cluster) ( + count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ( up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} ) ) @@ -276,11 +276,11 @@ spec: summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. expr: |- ( - count by (namespace,service,cluster) ( + count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ( changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4 ) / - count by (namespace,service,cluster) ( + count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ( up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} ) ) diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml index 8139fcaed93b..a62db1f06bc5 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml @@ -110,7 +110,7 @@ spec: ' runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor summary: Info-level alert inhibition. - expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 + expr: ALERTS{severity = "info"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 labels: severity: none {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml new file mode 100644 index 000000000000..ab4de1cb2d42 --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml @@ -0,0 +1,43 @@ +{{- /* +Generated from 'k8s.rules.container-cpu-usage-seconds-total' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerCpuUsageSecondsTotal }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-cpu-usage-seconds-total" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . }} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules.container_cpu_usage_seconds_total + rules: + - expr: |- + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) ( + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) ( + 1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerCpuUsageSecondsTotal }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerCpuUsageSecondsTotal }} + {{- toYaml . 
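The metadata.name in each of these new rule files joins the chart fullname with the sanitized group name and clamps the result to Kubernetes' 63-character name limit. A rough Python equivalent of the printf "%s-%s" ... | trunc 63 | trimSuffix "-" pipeline, with a made-up fullname:

    def rule_resource_name(fullname, group):
        # printf "%s-%s" | trunc 63, then drop the single trailing "-" truncation can leave
        name = ('%s-%s' % (fullname, group))[:63]
        return name.removesuffix('-')

    print(rule_resource_name('kube-prometheus-stack', 'k8s.rules.container-cpu-usage-seconds-total'))
    # kube-prometheus-stack-k8s.rules.container-cpu-usage-seconds-tot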
| nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml new file mode 100644 index 000000000000..73161d13b28e --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml @@ -0,0 +1,42 @@ +{{- /* +Generated from 'k8s.rules.container-memory-cache' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryCache }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-cache" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . }} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules.container_memory_cache + rules: + - expr: |- + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1, + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryCache }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryCache }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml new file mode 100644 index 000000000000..291d034b392b --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml @@ -0,0 +1,42 @@ +{{- /* +Generated from 'k8s.rules.container-memory-rss' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml +Do not change in-place! 
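For context on the ranged fragment that now prefixes every aggregation clause in these files: each configured label is emitted with its own trailing comma, so an empty additionalAggregationLabels list (the default this patch adds to values.yaml) renders expressions identical to the old single-file ones. A stand-in for Helm's range loop; render_fragment is a hypothetical helper, not chart code:

    def render_fragment(additional_labels):
        # '{{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}'
        return ''.join(label + ',' for label in additional_labels)

    for labels in ([], ['source_cluster']):
        print('sum by (' + render_fragment(labels) + 'cluster, namespace, pod, container) (...)')
    # sum by (cluster, namespace, pod, container) (...)
    # sum by (source_cluster,cluster, namespace, pod, container) (...)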
In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryRss }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-rss" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . }} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules.container_memory_rss + rules: + - expr: |- + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1, + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryRss }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryRss }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml new file mode 100644 index 000000000000..03091749db53 --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml @@ -0,0 +1,42 @@ +{{- /* +Generated from 'k8s.rules.container-memory-swap' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemorySwap }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-swap" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . 
}} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules.container_memory_swap + rules: + - expr: |- + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1, + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemorySwap }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemorySwap }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml new file mode 100644 index 000000000000..00a82c1d9260 --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml @@ -0,0 +1,42 @@ +{{- /* +Generated from 'k8s.rules.container-memory-working-set-bytes' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryWorkingSetBytes }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-working-set-bytes" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . }} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules.container_memory_working_set_bytes + rules: + - expr: |- + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster, namespace, pod) (1, + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryWorkingSetBytes }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryWorkingSetBytes }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml new file mode 100644 index 000000000000..74f6ead91783 --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml @@ -0,0 +1,167 @@ +{{- /* +Generated from 'k8s.rules.container-resource' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerResource }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-resource" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . }} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules.container_resource + rules: + - expr: |- + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: |- + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}namespace, pod, cluster) ( + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_requests:sum + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: |- + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: |- + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_requests:sum + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: |- + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: |- + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: |- + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: |- + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerResource }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml new file mode 100644 index 000000000000..602d4692e039 --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml @@ -0,0 +1,106 @@ +{{- /* +Generated from 'k8s.rules.pod-owner' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sPodOwner }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.pod-owner" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . }} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: k8s.rules.pod_owner + rules: + - expr: |- + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) group_left(owner_name) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) ( + 1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- end }} + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: |- + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: |- + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: |- + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: job + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.k8sPodOwner }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: namespace_workload_pod:kube_pod_owner:relabel +{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml deleted file mode 100644 index f697e2b994ed..000000000000 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.yaml +++ /dev/null @@ -1,322 +0,0 @@ -{{- /* -Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml -Do not change in-place! In order to change this file first read following link: -https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack -*/ -}} -{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} -{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }} -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }} - namespace: {{ template "kube-prometheus-stack.namespace" . }} - labels: - app: {{ template "kube-prometheus-stack.name" . 
}} -{{ include "kube-prometheus-stack.labels" . | indent 4 }} -{{- if .Values.defaultRules.labels }} -{{ toYaml .Values.defaultRules.labels | indent 4 }} -{{- end }} -{{- if .Values.defaultRules.annotations }} - annotations: -{{ toYaml .Values.defaultRules.annotations | indent 4 }} -{{- end }} -spec: - groups: - - name: k8s.rules - rules: - - expr: |- - sum by (cluster, namespace, pod, container) ( - irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, - max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_working_set_bytes - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, - max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_rss - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, - max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_cache - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . 
| nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, - max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_swap - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_memory:kube_pod_container_resource_requests:sum - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_cpu:kube_pod_container_resource_requests:sum - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . 
| nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_memory:kube_pod_container_resource_limits:sum - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) - group_left() max by (namespace, pod, cluster) ( - (kube_pod_status_phase{phase=~"Pending|Running"} == 1) - ) - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - sum by (namespace, cluster) ( - sum by (namespace, pod, cluster) ( - max by (namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} - ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_cpu:kube_pod_container_resource_limits:sum - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - labels: - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( - 1, max by (replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics"} - ) - ), - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: deployment - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . 
| nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: daemonset - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: statefulset - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: |- - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) - labels: - workload_type: job - {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- with .Values.defaultRules.additionalRuleLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.defaultRules.additionalRuleGroupLabels.k8s }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - record: namespace_workload_pod:kube_pod_owner:relabel -{{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml index 164c76b0d9bb..ceec327e8686 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-availability.rules.yaml @@ -36,7 +36,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} @@ -48,7 +48,7 @@ spec: {{- end }} {{- end }} record: code:apiserver_request_total:increase30d - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} @@ -60,7 +60,7 @@ spec: {{- end }} {{- end }} record: code:apiserver_request_total:increase30d - - expr: sum by (cluster, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{job="apiserver"}[1h])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{job="apiserver"}[1h])) record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: @@ -71,7 +71,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30) record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: @@ -82,7 +82,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: @@ -93,7 +93,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30) record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: @@ -108,31 +108,31 @@ spec: 1 - ( ( # write too slow - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) ) + ( # read too slow - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( ( - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) or vector(0) ) + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) ) ) + # errors - sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) ) / - sum by (cluster) (code:apiserver_request_total:increase30d) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d) labels: verb: all {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} @@ -146,26 +146,26 @@ spec: record: apiserver_request:availability30d - expr: |- 1 - ( - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( # too slow ( - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) or vector(0) ) + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) ) + # errors - sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) ) / - sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="read"}) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} @@ -181,16 +181,16 @@ spec: 1 - ( ( # too slow - sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) ) + # errors - sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) ) / - sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (code:apiserver_request_total:increase30d{verb="write"}) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} @@ -202,7 +202,7 @@ spec: {{- end }} {{- end }} record: apiserver_request:availability30d - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} @@ -214,7 +214,7 @@ spec: {{- end }} {{- end }} record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} @@ -226,7 +226,7 @@ spec: {{- end }} {{- end }} record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: @@ -237,7 +237,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: @@ -248,7 +248,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: @@ -259,7 +259,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverAvailability }} labels: diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml index 6966cbb061ec..369a0ae1b448 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-burnrate.rules.yaml @@ -28,26 +28,26 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - ( ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d])) or vector(0) ) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d])) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d])) ) ) + # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -63,26 +63,26 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - ( ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h])) or vector(0) ) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h])) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h])) ) ) + # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -98,26 +98,26 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - ( ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h])) or vector(0) ) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h])) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h])) ) ) + # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -133,26 +133,26 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - ( ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m])) or vector(0) ) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m])) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m])) ) ) + # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -168,26 +168,26 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - ( ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d])) or vector(0) ) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d])) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d])) ) ) + # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -203,26 +203,26 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - ( ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m])) or vector(0) ) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m])) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m])) ) ) + # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -238,26 +238,26 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - ( ( - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h])) or vector(0) ) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h])) + - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h])) ) ) + # errors - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: verb: read {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -273,15 +273,15 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d])) ) + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -297,15 +297,15 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h])) ) + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -321,15 +321,15 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h])) ) + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -345,15 +345,15 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m])) ) + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -369,15 +369,15 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d])) ) + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -393,15 +393,15 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m])) ) + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} @@ -417,15 +417,15 @@ spec: ( ( # too slow - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h])) ) + - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) ) / - sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: verb: write {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeApiserverBurnrate }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml index 5aa040842799..e269cf33cd1c 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-apiserver-histogram.rules.yaml @@ -24,7 +24,7 @@ spec: groups: - name: kube-apiserver-histogram.rules rules: - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: '0.99' verb: read @@ -37,7 +37,7 @@ spec: {{- end }} {{- end }} record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: '0.99' verb: write diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml index 55ca01658aeb..426cb3b3effb 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -37,9 +37,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors summary: kube-state-metrics is experiencing errors in list operations. expr: |- - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0.01 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -69,9 +69,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors summary: kube-state-metrics is experiencing errors in watch operations. 
expr: |- - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0.01 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -100,7 +100,7 @@ spec: description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch summary: kube-state-metrics sharding is misconfigured. - expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 + expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -129,9 +129,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing summary: kube-state-metrics shards are missing. expr: |- - 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1 + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - 1 - - sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) + sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml index 891af33dcbb3..795dacd9a3aa 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml @@ -24,7 +24,7 @@ spec: groups: - name: kubelet.rules rules: - - expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + - expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) labels: quantile: '0.99' {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubelet }} @@ -36,7 +36,7 @@ spec: {{- end }} {{- end }} record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + - expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) labels: quantile: '0.9' {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubelet }} @@ -48,7 +48,7 @@ spec: {{- end }} {{- end }} record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + - expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) labels: quantile: '0.5' {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubelet }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml index 513a1f309eac..d3689fbec175 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -66,11 +66,11 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready summary: Pod has been in a non-ready state for more than 15 minutes. expr: |- - sum by (namespace, pod, cluster) ( - max by(namespace, pod, cluster) ( + sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( + max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown|Failed"} - ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( - 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left(owner_kind) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}namespace, pod, cluster) ( + 1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) ) ) > 0 for: 15m @@ -362,7 +362,7 @@ spec: description: pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting summary: Pod container waiting longer than 1 hour - expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0 + expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0 for: 1h {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -450,7 +450,7 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted summary: Job did not complete in time expr: |- - time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + time() - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} and kube_job_status_active{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0) > 43200 labels: diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml index 26d4de6f1d23..8b7e965fd6af 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -37,9 +37,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit summary: Cluster has overcommitted CPU resource requests. expr: |- - sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 and - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . 
}},{{ end }}cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0
       for: 10m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -68,9 +68,9 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit
         summary: Cluster has overcommitted memory resource requests.
       expr: |-
-        sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
+        sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0
         and
-        (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
+        (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0
       for: 10m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -99,9 +99,9 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit
         summary: Cluster has overcommitted CPU resource requests.
       expr: |-
-        sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
+        sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
         /
-        sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
+        sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
         > 1.5
       for: 5m
       {{- with .Values.defaultRules.keepFiringFor }}
@@ -131,9 +131,9 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit
         summary: Cluster has overcommitted memory resource requests.
       expr: |-
-        sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
+        sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
         /
-        sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
+        sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
         > 1.5
       for: 5m
       {{- with .Values.defaultRules.keepFiringFor }}
@@ -259,9 +259,9 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh
         summary: Processes experience elevated CPU throttling.
       expr: |-
-        sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
+        sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, container, pod, namespace)
         /
-        sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
+        sum(increase(container_cpu_cfs_periods_total{}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, container, pod, namespace)
         > ( 25 / 100 )
       for: 15m
       {{- with .Values.defaultRules.keepFiringFor }}
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml
index 631af0e18b79..050c0cb60955 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml
@@ -45,9 +45,9 @@ spec:
         ) < 0.03
         and
         kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
       for: 1m
       {{- with .Values.defaultRules.keepFiringFor }}
@@ -86,9 +86,9 @@
         kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
         and
         predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
       for: 1h
       {{- with .Values.defaultRules.keepFiringFor }}
@@ -125,9 +125,9 @@
         ) < 0.03
         and
         kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
       for: 1m
       {{- with .Values.defaultRules.keepFiringFor }}
@@ -166,9 +166,9 @@
         kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
         and
         predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
-        unless on(namespace, persistentvolumeclaim)
+        unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, persistentvolumeclaim)
         kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
       for: 1h
       {{- with .Values.defaultRules.keepFiringFor }}
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
index bf4f27a0d200..30601baa59e5 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
@@ -36,7 +36,7 @@ spec:
         description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
         summary: Client certificate is about to expire.
-      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
+      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
       for: 5m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -64,7 +64,7 @@ spec:
         description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
         summary: Client certificate is about to expire.
-      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
+      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job) histogram_quantile(0.01, sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
       for: 5m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -92,7 +92,7 @@ spec:
         description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors
         summary: Kubernetes aggregated API has reported errors.
-      expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4
+      expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4
       labels:
         severity: warning
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesSystem }}
@@ -116,7 +116,7 @@ spec:
         description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown
         summary: Kubernetes aggregated API is down.
-      expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
+      expr: (1 - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
       for: 5m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
index 643ec0f3fb5f..5a5de7723fc9 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
@@ -93,11 +93,11 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods
         summary: Kubelet is running at capacity.
       expr: |-
-        count by(cluster, node) (
-          (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
+        count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
+          (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
         )
         /
-        max by(cluster, node) (
+        max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
           kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
         ) > 0.95
       for: 15m
@@ -127,7 +127,7 @@ spec:
         description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping
         summary: Node readiness status is flapping.
-      expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
+      expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) > 2
       for: 15m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -183,7 +183,7 @@ spec:
         description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh
         summary: Kubelet Pod startup latency is too high.
-      expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
+      expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, le)) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
       for: 15m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml
index f75ba36ee93a..a32747686e05 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml
@@ -36,7 +36,7 @@ spec:
         description: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch
         summary: Different semantic versions of Kubernetes components running.
-      expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+      expr: count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
       for: 15m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -65,9 +65,9 @@ spec:
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors
         summary: Kubernetes API server client is experiencing errors.
       expr: |-
-        (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
+        (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, job, namespace)
         /
-        sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace))
+        sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, instance, job, namespace))
         > 0.01
       for: 15m
       {{- with .Values.defaultRules.keepFiringFor }}
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml
index ca01c6b913aa..8877f4928337 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml
@@ -25,8 +25,8 @@ spec:
     - name: node.rules
       rules:
       - expr: |-
-          topk by(cluster, namespace, pod) (1,
-            max by (cluster, node, namespace, pod) (
+          topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
+            max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod) (
              label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
           ))
         record: 'node_namespace_pod:kube_pod_info:'
@@ -40,10 +40,10 @@
           {{- end }}
       {{- end }}
       - expr: |-
-          count by (cluster, node) (
+          count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
             node_cpu_seconds_total{mode="idle",job="node-exporter"}
-          * on (namespace, pod) group_left(node)
-          topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
+          * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod) group_left(node)
+          topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod) (1, node_namespace_pod:kube_pod_info:)
           )
         record: node:node_num_cpu:sum
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }}
@@ -64,7 +64,7 @@
             node_memory_MemFree_bytes{job="node-exporter"} +
             node_memory_Slab_bytes{job="node-exporter"}
           )
-        ) by (cluster)
+        ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
         record: :node_memory_MemAvailable_bytes:sum
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }}
         labels:
@@ -76,7 +76,7 @@
           {{- end }}
       {{- end }}
       - expr: |-
-          avg by (cluster, node) (
+          avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) (
             sum without (mode) (
               rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
             )
@@ -92,7 +92,7 @@
           {{- end }}
       {{- end }}
       - expr: |-
-          avg by (cluster) (
+          avg by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) (
             node:node_cpu_utilization:ratio_rate5m
           )
         record: cluster:node_cpu:ratio_rate5m
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml
index a5d679c0134d..1e6a2a832de6 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus-operator.yaml
@@ -38,7 +38,7 @@ spec:
         description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorlisterrors
         summary: Errors while performing list operations in controller.
-      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4
+      expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4
       for: 15m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -66,7 +66,7 @@ spec:
         description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorwatcherrors
         summary: Errors while performing watch operations in controller.
-      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4
+      expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4
       for: 15m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -122,7 +122,7 @@ spec:
         description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorreconcileerrors
         summary: Errors while reconciling controller.
-      expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1
+      expr: (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1
       for: 10m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
@@ -178,7 +178,7 @@ spec:
         description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
         runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornotready
         summary: Prometheus operator not ready
-      expr: min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0)
+      expr: min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0)
       for: 5m
       {{- with .Values.defaultRules.keepFiringFor }}
       keep_firing_for: "{{ . }}"
diff --git a/charts/kube-prometheus-stack/values.yaml b/charts/kube-prometheus-stack/values.yaml
index bd37eaa98a0e..5e181578c66e 100644
--- a/charts/kube-prometheus-stack/values.yaml
+++ b/charts/kube-prometheus-stack/values.yaml
@@ -42,7 +42,12 @@ defaultRules:
     etcd: true
     configReloaders: true
     general: true
-    k8s: true
+    k8sContainerCpuUsageSecondsTotal: true
+    k8sContainerMemoryCache: true
+    k8sContainerMemoryRss: true
+    k8sContainerMemorySwap: true
+    k8sContainerResource: true
+    k8sPodOwner: true
     kubeApiserverAvailability: true
     kubeApiserverBurnrate: true
     kubeApiserverHistogram: true
@@ -90,7 +95,12 @@ defaultRules:
     etcd: {}
     configReloaders: {}
     general: {}
-    k8s: {}
+    k8sContainerCpuUsageSecondsTotal: {}
+    k8sContainerMemoryCache: {}
+    k8sContainerMemoryRss: {}
+    k8sContainerMemorySwap: {}
+    k8sContainerResource: {}
+    k8sPodOwner: {}
     kubeApiserverAvailability: {}
     kubeApiserverBurnrate: {}
     kubeApiserverHistogram: {}
@@ -120,7 +130,12 @@ defaultRules:
     etcd: {}
     configReloaders: {}
     general: {}
-    k8s: {}
+    k8sContainerCpuUsageSecondsTotal: {}
+    k8sContainerMemoryCache: {}
+    k8sContainerMemoryRss: {}
+    k8sContainerMemorySwap: {}
+    k8sContainerResource: {}
+    k8sPodOwner: {}
     kubeApiserverAvailability: {}
     kubeApiserverBurnrate: {}
     kubeApiserverHistogram: {}
@@ -144,6 +159,8 @@ defaultRules:
     prometheus: {}
     prometheusOperator: {}
 
+  additionalAggregationLabels: []
+
   ## Prefix for runbook URLs. Use this to override the first part of the runbookURLs that is common to all rules.
   runbookUrl: "https://runbooks.prometheus-operator.dev/runbooks"
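
Note for reviewers (illustration only, not part of the patch; the label name "region" below is a made-up example): with a values override such as

    defaultRules:
      additionalAggregationLabels:
        - region

the `{{ range }}` loop prepends each configured label plus a trailing comma inside every templated `by (...)` / `on (...)` clause. The KubeNodeReadinessFlapping expression above would then render as

    sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (region,cluster, node) > 2

and with the default empty list it renders as `by (cluster, node)`, so the shipped rules stay semantically unchanged (only the `by(`/`on(` spacing differs from before, courtesy of the sync regex).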
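Likewise illustrative only: since the single `k8s` toggle is removed, a user who previously set `defaultRules.rules.k8s: false` would now disable the split groups individually, e.g.

    defaultRules:
      rules:
        k8sContainerCpuUsageSecondsTotal: false
        k8sContainerMemoryCache: false
        k8sContainerMemoryRss: false
        k8sContainerMemorySwap: false
        k8sContainerResource: false
        k8sPodOwner: false

using the per-group keys this patch introduces in values.yaml.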