Skip to content

Commit

Permalink
[kube-prometheus-stack] allow kube-state-metrics job name to be overr…
Browse files Browse the repository at this point in the history
…idden (#4160)
  • Loading branch information
jhandguy authored Jan 22, 2024
1 parent 932c901 commit 2f3dbdd
Show file tree
Hide file tree
Showing 12 changed files with 118 additions and 95 deletions.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 56.0.1
version: 56.0.2
appVersion: v0.71.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down
5 changes: 4 additions & 1 deletion charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,10 @@ def new_representer(dumper, data):
'replacement': '$1',
'init': ''},
'job="kube-state-metrics"': {
'replacement': 'job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"',
'replacement': 'job="{{ $kubeStateMetricsJob }}"',
'init': '{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}'},
'job="{{ $kubeStateMetricsJob }}"': {
'replacement': 'job="{{ $kubeStateMetricsJob }}", namespace=~"{{ $targetNamespace }}"',
'limitGroup': ['kubernetes-apps'],
'init': '{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}'},
'job="kubelet"': {
Expand Down
11 changes: 11 additions & 0 deletions charts/kube-prometheus-stack/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,17 @@ Use the grafana namespace override for multi-namespace deployments in combined c
{{- end -}}
{{- end -}}

{{/*
Resolve the kube-state-metrics job name used in rule expressions.
Emits the "nameOverride" value configured under the "kube-state-metrics"
values key when it is non-empty; otherwise falls back to the literal
"kube-state-metrics".
*/}}
{{- define "kube-prometheus-stack-kube-state-metrics.name" -}}
{{- default "kube-state-metrics" (index .Values "kube-state-metrics" "nameOverride") -}}
{{- end -}}

{{/*
Use the kube-state-metrics namespace override for multi-namespace deployments in combined charts
*/}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerResource }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -25,7 +26,7 @@ spec:
- name: k8s.rules.container_resource
rules:
- expr: |-
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -43,7 +44,7 @@ spec:
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
kube_pod_container_resource_requests{resource="memory",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
Expand All @@ -60,7 +61,7 @@ spec:
{{- end }}
{{- end }}
- expr: |-
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -78,7 +79,7 @@ spec:
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
kube_pod_container_resource_requests{resource="cpu",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
Expand All @@ -95,7 +96,7 @@ spec:
{{- end }}
{{- end }}
- expr: |-
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -113,7 +114,7 @@ spec:
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
kube_pod_container_resource_limits{resource="memory",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
Expand All @@ -130,7 +131,7 @@ spec:
{{- end }}
{{- end }}
- expr: |-
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"} * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster)
group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
Expand All @@ -148,7 +149,7 @@ spec:
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, cluster) (
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
kube_pod_container_resource_limits{resource="cpu",job="{{ $kubeStateMetricsJob }}"}
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) group_left() max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sPodOwner }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -28,11 +29,11 @@ spec:
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) group_left(owner_name) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace) (
1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}replicaset, namespace, owner_name) (
kube_replicaset_owner{job="kube-state-metrics"}
kube_replicaset_owner{job="{{ $kubeStateMetricsJob }}"}
)
),
"workload", "$1", "owner_name", "(.*)"
Expand All @@ -52,7 +53,7 @@ spec:
- expr: |-
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
Expand All @@ -70,7 +71,7 @@ spec:
- expr: |-
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
Expand All @@ -88,7 +89,7 @@ spec:
- expr: |-
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
kube_pod_owner{job="{{ $kubeStateMetricsJob }}", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubeStateMetrics }}
{{- $kubeStateMetricsJob := include "kube-prometheus-stack-kube-state-metrics.name" . }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand Down Expand Up @@ -37,9 +38,9 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
(sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster))
sum(rate(kube_state_metrics_list_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster))
> 0.01
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
Expand Down Expand Up @@ -69,9 +70,9 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
(sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}",result="error"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster))
sum(rate(kube_state_metrics_watch_total{job="{{ $kubeStateMetricsJob }}"}[5m])) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster))
> 0.01
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
Expand Down Expand Up @@ -100,7 +101,7 @@ spec:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0
expr: stdvar (kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) != 0
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
Expand Down Expand Up @@ -129,9 +130,9 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - 1
2^max(kube_state_metrics_total_shards{job="{{ $kubeStateMetricsJob }}"}) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster) - 1
-
sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
sum( 2 ^ max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="{{ $kubeStateMetricsJob }}"}) ) by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster)
!= 0
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
Expand Down
Loading

0 comments on commit 2f3dbdd

Please sign in to comment.