diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py index 058ec66f66ce..a621b3d6c1ce 100755 --- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py +++ b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py @@ -160,7 +160,10 @@ def new_representer(dumper, data): 'replacement': '$1', 'init': ''}, 'job="kube-state-metrics"': { - 'replacement': 'job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"', + 'replacement': 'job="{{ $kubeStateMetricsJob }}"', + 'init': '{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}'}, + 'job="{{ $kubeStateMetricsJob }}"': { + 'replacement': 'job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"', 'limitGroup': ['kubernetes-apps'], 'init': '{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}'}, 'job="kubelet"': { diff --git a/charts/kube-prometheus-stack/templates/_helpers.tpl b/charts/kube-prometheus-stack/templates/_helpers.tpl index 5787ec524ff2..4ea457974219 100644 --- a/charts/kube-prometheus-stack/templates/_helpers.tpl +++ b/charts/kube-prometheus-stack/templates/_helpers.tpl @@ -149,6 +149,17 @@ Use the grafana namespace override for multi-namespace deployments in combined c {{- end -}} {{- end -}} +{{/* +Allow kube-state-metrics job name to be overridden +*/}} +{{- define "kube-prometheus-stack-kube-state-metrics.name" -}} + {{- if index .Values "kube-state-metrics" "nameOverride" -}} + {{- index .Values "kube-state-metrics" "nameOverride" -}} + {{- else -}} + {{- print "kube-state-metrics" -}} + {{- end -}} +{{- end -}} + {{/* Use the kube-state-metrics namespace override for multi-namespace deployments in combined charts */}} diff --git a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/deployment/deployment.yaml b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/deployment/deployment.yaml index 5206f244aaaa..350810daef6c 100644 --- a/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/deployment/deployment.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/deployment/deployment.yaml @@ -26,7 +26,6 @@ spec: labels: app: {{ template "kube-prometheus-stack.name" . }}-operator-webhook {{- include "kube-prometheus-stack.prometheus-operator-webhook.labels" . | nindent 8 }} - release: {{ $.Release.Name | quote }} {{- if .Values.prometheusOperator.admissionWebhooks.deployment.podLabels }} {{ toYaml .Values.prometheusOperator.admissionWebhooks.deployment.podLabels | indent 8 }} {{- end }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml index 74f6ead91783..4cdb7d87fdd2 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerResource }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -25,7 +26,7 @@ spec: - name: k8s.rules.container_resource rules: - expr: |- - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) @@ -43,7 +44,7 @@ spec: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"} ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) @@ -60,7 +61,7 @@ spec: {{- end }} {{- end }} - expr: |- - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) @@ -78,7 +79,7 @@ spec: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"} ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) @@ -95,7 +96,7 @@ spec: {{- end }} {{- end }} - expr: |- - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) @@ -113,7 +114,7 @@ spec: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"} ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) @@ -130,7 +131,7 @@ spec: {{- end }} {{- end }} - expr: |- - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) + kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) @@ -148,7 +149,7 @@ spec: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) ( sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) ( - kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"} ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml index 602d4692e039..e06356d4e44b 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sPodOwner }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -28,11 +29,11 @@ spec: max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( label_replace( label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) group_left(owner_name) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) ( 1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics"} + kube_replicaset_owner{job="{{ $kubeStateMetricsJob }}"} ) ), "workload", "$1", "owner_name", "(.*)" @@ -52,7 +53,7 @@ spec: - expr: |- max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) ) @@ -70,7 +71,7 @@ spec: - expr: |- max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ) ) @@ -88,7 +89,7 @@ spec: - expr: |- max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) ( label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, + kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="Job"}, "workload", "$1", "owner_name", "(.*)" ) ) diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml index 426cb3b3effb..7471bd998343 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubeStateMetrics }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -37,9 +38,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors summary: kube-state-metrics is experiencing errors in list operations. expr: |- - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) + (sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) + sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0.01 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -69,9 +70,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors summary: kube-state-metrics is experiencing errors in watch operations. expr: |- - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) + (sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) + sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0.01 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -100,7 +101,7 @@ spec: description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch summary: kube-state-metrics sharding is misconfigured. - expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 + expr: stdvar (kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -129,9 +130,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing summary: kube-state-metrics shards are missing. expr: |- - 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - 1 + 2^max(kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - 1 - - sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) + sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="{{ $kubeStateMetricsJob }}"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml index d3689fbec175..48845fe7bc35 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} {{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -37,7 +38,7 @@ spec: description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").' runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping summary: Pod is crash looping. - expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1 + expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -68,7 +69,7 @@ spec: expr: |- sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( - kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown|Failed"} + kube_pod_status_phase{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown|Failed"} ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left(owner_kind) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) ( 1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) ) @@ -101,9 +102,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch summary: Deployment generation mismatch due to possible roll-back expr: |- - kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_deployment_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_deployment_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -133,11 +134,11 @@ spec: summary: Deployment has not matched the expected number of replicas. expr: |- ( - kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_deployment_spec_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > - kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_deployment_status_replicas_available{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) + changes(kube_deployment_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[10m]) == 0 ) @@ -169,7 +170,7 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentrolloutstuck summary: Deployment rollout is not progressing. expr: |- - kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_deployment_status_condition{condition="Progressing", status="false",job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -200,11 +201,11 @@ spec: summary: StatefulSet has not matched the expected number of replicas. expr: |- ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_status_replicas_ready{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_status_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) + changes(kube_statefulset_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[10m]) == 0 ) @@ -236,9 +237,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch summary: StatefulSet generation mismatch due to possible roll-back expr: |- - kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_status_observed_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_metadata_generation{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -269,18 +270,18 @@ spec: expr: |- ( max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_status_current_revision{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} unless - kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_status_update_revision{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} ) * ( - kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_statefulset_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} ) ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) + changes(kube_statefulset_status_replicas_updated{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m]) == 0 ) @@ -314,24 +315,24 @@ spec: expr: |- ( ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_current_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_number_misscheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != 0 ) or ( - kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_updated_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_number_available{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} ) ) and ( - changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) + changes(kube_daemonset_status_updated_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[5m]) == 0 ) @@ -362,7 +363,7 @@ spec: description: pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting summary: Pod container waiting longer than 1 hour - expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0 + expr: sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) > 0 for: 1h {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -391,9 +392,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled summary: DaemonSet pods are not scheduled. expr: |- - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_daemonset_status_desired_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 + kube_daemonset_status_current_number_scheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 for: 10m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -421,7 +422,7 @@ spec: description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. - expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 + expr: kube_daemonset_status_number_misscheduled{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -450,9 +451,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted summary: Job did not complete in time expr: |- - time() - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + time() - max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, job_name, cluster) (kube_job_status_start_time{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} and - kube_job_status_active{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0) > 43200 + kube_job_status_active{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0) > 43200 labels: severity: warning {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubernetesApps }} @@ -476,7 +477,7 @@ spec: description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed summary: Job failed to complete. - expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0 + expr: kube_job_failed{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -505,19 +506,19 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch summary: HPA has not matched desired number of replicas. expr: |- - (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_desired_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} != - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} > - kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_spec_min_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + (kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} < - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) + kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}) and - changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 + changes(kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"}[15m]) == 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -546,9 +547,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout summary: HPA is running at max replicas expr: |- - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_horizontalpodautoscaler_status_current_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} == - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} + kube_horizontalpodautoscaler_spec_max_replicas{job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"} for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml index 8b7e965fd6af..f0e49dc3b131 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -37,9 +38,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit summary: Cluster has overcommitted CPU resource requests. expr: |- - sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 + sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="{{ $kubeStateMetricsJob }}",}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 and - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 + (sum(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{job="{{ $kubeStateMetricsJob }}",resource="cpu"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 for: 10m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -68,9 +69,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit summary: Cluster has overcommitted memory resource requests. expr: |- - sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 and - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 + (sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - max(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)) > 0 for: 10m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -99,9 +100,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit summary: Cluster has overcommitted CPU resource requests. expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) + sum(min without(resource) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) / - sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) + sum(kube_node_status_allocatable{resource="cpu", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) > 1.5 for: 5m {{- with .Values.defaultRules.keepFiringFor }} @@ -131,9 +132,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit summary: Cluster has overcommitted memory resource requests. expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) + sum(min without(resource) (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard", resource=~"(memory|requests.memory)"})) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) / - sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) + sum(kube_node_status_allocatable{resource="memory", job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) > 1.5 for: 5m {{- with .Values.defaultRules.keepFiringFor }} @@ -163,9 +164,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull summary: Namespace quota is going to be full. expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} + kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) > 0.9 < 1 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -195,9 +196,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused summary: Namespace quota is fully used. expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} + kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) == 1 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -227,9 +228,9 @@ spec: runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded summary: Namespace quota has exceeded the limits. expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} + kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="used"} / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + (kube_resourcequota{job="{{ $kubeStateMetricsJob }}", type="hard"} > 0) > 1 for: 15m {{- with .Values.defaultRules.keepFiringFor }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml index 9bb3d528c5da..1927c7ad4ea1 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} {{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule @@ -197,7 +198,7 @@ spec: description: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} on Cluster {{`{{`}} $labels.cluster {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors summary: PersistentVolume is having issues with provisioning. - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="{{ $kubeStateMetricsJob }}"} > 0 for: 5m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml index 5a5de7723fc9..75efdd647171 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -36,7 +37,7 @@ spec: description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.' runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready summary: Node is not ready. - expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + expr: kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",condition="Ready",status="true"} == 0 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -64,7 +65,7 @@ spec: description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.' runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable summary: Node is unreachable. - expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + expr: (kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="{{ $kubeStateMetricsJob }}",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -94,11 +95,11 @@ spec: summary: Kubelet is running at capacity. expr: |- count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + (kube_pod_status_phase{job="{{ $kubeStateMetricsJob }}",phase="Running"} == 1) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}instance,pod,namespace,cluster) (1, kube_pod_info{job="{{ $kubeStateMetricsJob }}"}) ) / max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) ( - kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 + kube_node_status_capacity{job="{{ $kubeStateMetricsJob }}",resource="pods"} != 1 ) > 0.95 for: 15m {{- with .Values.defaultRules.keepFiringFor }} @@ -127,7 +128,7 @@ spec: description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes. runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping summary: Node readiness status is flapping. - expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) > 2 + expr: sum(changes(kube_node_status_condition{job="{{ $kubeStateMetricsJob }}",status="true",condition="Ready"}[15m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node) > 2 for: 15m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml index 7323916816d1..71c246d685bf 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -27,7 +28,7 @@ spec: - expr: |- topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, node, namespace, pod) ( - label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + label_replace(kube_pod_info{job="{{ $kubeStateMetricsJob }}",node!=""}, "pod", "$1", "pod", "(.*)") )) record: 'node_namespace_pod:kube_pod_info:' {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.node }} diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/windows.pod.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/windows.pod.rules.yaml index 495004aa2f75..86340b5c0573 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/windows.pod.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/windows.pod.rules.yaml @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.windowsMonitoring.enabled .Values.defaultRules.rules.windows }} +{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -24,7 +25,7 @@ spec: groups: - name: windows.pod.rules rules: - - expr: windows_container_available{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) + - expr: windows_container_available{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) record: windows_pod_container_available {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: @@ -35,7 +36,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: windows_container_cpu_usage_seconds_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) + - expr: windows_container_cpu_usage_seconds_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) record: windows_container_total_runtime {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: @@ -46,7 +47,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: windows_container_memory_usage_commit_bytes{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) + - expr: windows_container_memory_usage_commit_bytes{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) record: windows_container_memory_usage {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: @@ -57,7 +58,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: windows_container_memory_usage_private_working_set_bytes{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) + - expr: windows_container_memory_usage_private_working_set_bytes{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) record: windows_container_private_working_set_usage {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: @@ -68,7 +69,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: windows_container_network_receive_bytes_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) + - expr: windows_container_network_receive_bytes_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) record: windows_container_network_received_bytes_total {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: @@ -79,7 +80,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: windows_container_network_transmit_bytes_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="kube-state-metrics", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) + - expr: windows_container_network_transmit_bytes_total{job="windows-exporter", container_id != ""} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container_id, cluster) group_left(container, pod, namespace) max(kube_pod_container_info{job="{{ $kubeStateMetricsJob }}", container_id != ""}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container, container_id, pod, namespace, cluster) record: windows_container_network_transmitted_bytes_total {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: @@ -92,7 +93,7 @@ spec: {{- end }} - expr: |- max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) ( - kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"} ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available) record: kube_pod_windows_container_resource_memory_request {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} @@ -104,7 +105,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available) + - expr: kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available) record: kube_pod_windows_container_resource_memory_limit {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: @@ -117,7 +118,7 @@ spec: {{- end }} - expr: |- max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) ( - kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"} ) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available) record: kube_pod_windows_container_resource_cpu_cores_request {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} @@ -129,7 +130,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available) + - expr: kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}container,pod,namespace,cluster) (windows_pod_container_available) record: kube_pod_windows_container_resource_cpu_cores_limit {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.windows }} labels: