Skip to content

Commit

Permalink
[kube-prometheus-stack] Configure additional aggregation labels + k8s…
Browse files Browse the repository at this point in the history
….rules split (#3883)

* [kube-prometheus-stack] Configure additional aggregation labels

Signed-off-by: Jan-Otto Kröpke <[email protected]>

* Re-run sync

Signed-off-by: Jan-Otto Kröpke <[email protected]>

* Re-run sync

Signed-off-by: Jan-Otto Kröpke <[email protected]>

* Add new values for rules

Signed-off-by: Jan-Otto Kröpke <[email protected]>

* Add new values for rules

Signed-off-by: Jan-Otto Kröpke <[email protected]>

* Add new values for rules

Signed-off-by: Jan-Otto Kröpke <[email protected]>

* [kube-prometheus-stack] Fix names

Signed-off-by: Jan-Otto Kröpke <[email protected]>

* bump version

Signed-off-by: Jan-Otto Kröpke <[email protected]>

---------

Signed-off-by: Jan-Otto Kröpke <[email protected]>
Signed-off-by: Jan-Otto Kröpke <[email protected]>
Signed-off-by: Jan-Otto Kröpke <[email protected]>
Co-authored-by: Gabriel Martinez <[email protected]>
  • Loading branch information
jkroepke and GMartinez-Sisti authored Nov 22, 2023
1 parent 49e3c3a commit 4c10243
Show file tree
Hide file tree
Showing 27 changed files with 700 additions and 501 deletions.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 54.1.0
version: 54.2.0
appVersion: v0.69.1
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down
20 changes: 17 additions & 3 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,13 @@ def new_representer(dumper, data):
'config-reloaders': ' .Values.defaultRules.rules.configReloaders',
'etcd': ' .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd',
'general.rules': ' .Values.defaultRules.rules.general',
'k8s.rules': ' .Values.defaultRules.rules.k8s',
'k8s.rules.container_cpu_usage_seconds_total': ' .Values.defaultRules.rules.k8sContainerCpuUsageSecondsTotal',
'k8s.rules.container_memory_cache': ' .Values.defaultRules.rules.k8sContainerMemoryCache',
'k8s.rules.container_memory_rss': ' .Values.defaultRules.rules.k8sContainerMemoryRss',
'k8s.rules.container_memory_swap': ' .Values.defaultRules.rules.k8sContainerMemorySwap',
'k8s.rules.container_memory_working_set_bytes': ' .Values.defaultRules.rules.k8sContainerMemoryWorkingSetBytes',
'k8s.rules.container_resource': ' .Values.defaultRules.rules.k8sContainerResource',
'k8s.rules.pod_owner': ' .Values.defaultRules.rules.k8sPodOwner',
'kube-apiserver-availability.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverAvailability',
'kube-apiserver-burnrate.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverBurnrate',
'kube-apiserver-histogram.rules': ' .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverHistogram',
Expand Down Expand Up @@ -417,7 +423,7 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
rules = add_rules_per_rule_conditions(rules, group)
# initialize header
lines = header % {
'name': group['name'],
'name': sanitize_name(group['name']),
'url': url,
'condition': condition_map.get(group['name'], ''),
'init_line': init_line,
Expand All @@ -426,7 +432,11 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
}

# rules themselves
lines += rules
lines += re.sub(
r'\s(by|on) ?\(',
r' \1 ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}',
rules
)

# footer
lines += '{{- end }}'
Expand Down Expand Up @@ -541,6 +551,10 @@ def main():
print("Finished")


def sanitize_name(name):
return re.sub('[_]', '-', name).lower()


def jsonnet_import_callback(base, rel):
if "github.com" in base:
base = os.getcwd() + '/vendor/' + base[base.find('github.com'):]
Expand Down
8 changes: 7 additions & 1 deletion charts/kube-prometheus-stack/templates/prometheus/_rules.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ rules:
- "config-reloaders"
- "etcd"
- "general.rules"
- "k8s.rules"
- "k8s.rules.container_cpu_usage_seconds_total"
- "k8s.rules.container_memory_cache"
- "k8s.rules.container_memory_rss"
- "k8s.rules.container_memory_swap"
- "k8s.rules.container_memory_working_set_bytes"
- "k8s.rules.container_resource"
- "k8s.rules.pod_owner"
- "kube-apiserver-availability.rules"
- "kube-apiserver-burnrate.rules"
- "kube-apiserver-histogram.rules"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ spec:
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
< on (namespace,service,cluster) group_left
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
< on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) group_left
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
Expand Down Expand Up @@ -137,7 +137,7 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |-
min by (namespace,service, integration) (
min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
Expand Down Expand Up @@ -171,7 +171,7 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |-
min by (namespace,service, integration) (
min by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
Expand Down Expand Up @@ -205,8 +205,8 @@ spec:
runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |-
count by (namespace,service,cluster) (
count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
count_values by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
)
!= 1
for: 20m
Expand Down Expand Up @@ -238,11 +238,11 @@ spec:
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |-
(
count by (namespace,service,cluster) (
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5
)
/
count by (namespace,service,cluster) (
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
Expand Down Expand Up @@ -276,11 +276,11 @@ spec:
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |-
(
count by (namespace,service,cluster) (
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4
)
/
count by (namespace,service,cluster) (
count by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace,service,cluster) (
up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ spec:
'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
expr: ALERTS{severity = "info"} == 1 unless on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{{- /*
Generated from 'k8s.rules.container-cpu-usage-seconds-total' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerCpuUsageSecondsTotal }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-cpu-usage-seconds-total" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_cpu_usage_seconds_total
rules:
- expr: |-
sum by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, container) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (
1, max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerCpuUsageSecondsTotal }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerCpuUsageSecondsTotal }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{{- /*
Generated from 'k8s.rules.container-memory-cache' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryCache }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-cache" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_memory_cache
rules:
- expr: |-
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryCache }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryCache }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{{- /*
Generated from 'k8s.rules.container-memory-rss' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemoryRss }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-rss" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_memory_rss
rules:
- expr: |-
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryRss }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemoryRss }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{{- /*
Generated from 'k8s.rules.container-memory-swap' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8sContainerMemorySwap }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules.container-memory-swap" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: k8s.rules.container_memory_swap
rules:
- expr: |-
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) group_left(node) topk by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod) (1,
max by ({{ range $.Values.defaultRules.additionalAggregationLabels }}{{ . }},{{ end }}cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemorySwap }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.k8sContainerMemorySwap }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
Loading

0 comments on commit 4c10243

Please sign in to comment.