diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index 87ca9fc94fff..0eae66ac2974 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -21,7 +21,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 52.1.13 +version: 52.1.14 appVersion: v0.68.0 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py index 7bffd088343d..dcb0c2d4e84c 100755 --- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py +++ b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py @@ -118,6 +118,7 @@ def new_representer(dumper, data): 'whizard-telemetry-namespace.rules': ' .Values.defaultRules.rules.whizardTelemetry', 'whizard-telemetry-node.rules': ' .Values.defaultRules.rules.whizardTelemetry', 'whizard-telemetry-etcd.rules': ' .Values.defaultRules.rules.whizardTelemetry', + 'whizard-telemetry-kube-scheduler.rules': ' .Values.defaultRules.rules.whizardTelemetry', } alert_condition_map = { diff --git a/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl b/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl index c2bd1dccb0c3..0fe60526742c 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl +++ b/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl @@ -19,4 +19,5 @@ rules: - "whizard-telemetry-namespace.rules" - "whizard-telemetry-node.rules" - "whizard-telemetry-etcd.rules" + - "whizard-telemetry-kube-scheduler.rules" {{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-kube-scheduler.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-kube-scheduler.rules.yaml new file mode 100644 index 000000000000..c08876d80e6f --- /dev/null +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-kube-scheduler.rules.yaml @@ -0,0 +1,121 @@ +{{- /* +Generated from 'whizard-telemetry-kube-scheduler.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml +Do not change in-place! In order to change this file first read following link: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack +*/ -}} +{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} +{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.whizardTelemetry }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-kube-scheduler.rules" | trunc 63 | trimSuffix "-" }} + namespace: {{ template "kube-prometheus-stack.namespace" . }} + labels: + app: {{ template "kube-prometheus-stack.name" . }} +{{ include "kube-prometheus-stack.labels" . | indent 4 }} +{{- if .Values.defaultRules.labels }} +{{ toYaml .Values.defaultRules.labels | indent 4 }} +{{- end }} +{{- if .Values.defaultRules.annotations }} + annotations: +{{ toYaml .Values.defaultRules.annotations | indent 4 }} +{{- end }} +spec: + groups: + - name: whizard-telemetry-kube-scheduler.rules + rules: + - expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: '0.99' + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: '0.99' + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: '0.9' + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: '0.9' + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: '0.5' + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: '0.5' + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile + - expr: sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_count{job="kube-scheduler"}[5m])) + record: cluster:scheduler_e2e_scheduling_duration_seconds:avg + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + - expr: sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_count{job="kube-scheduler"}[5m])) + record: cluster:scheduler_scheduling_attempt_duration_seconds:avg + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet index 6239acce1ceb..e7207c2fe240 100644 --- a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet +++ b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet @@ -11,6 +11,7 @@ podLabel: 'pod', etcd_selector: 'job=~".*etcd.*"', etcd_instance_labels: 'instance', + kubeSchedulerSelector: 'job="kube-scheduler"', }, prometheusRules+:: { @@ -657,6 +658,36 @@ ], }, + { + name: "whizard-telemetry-kube-scheduler.rules", + rules: [ + { + record: 'cluster_quantile:%s:histogram_quantile' % metric, + expr: ||| + histogram_quantile(%(quantile)s, sum(rate(%(metric)s_bucket{%(kubeSchedulerSelector)s}[5m])) without(instance, %(podLabel)s, result)) + ||| % ({ quantile: quantile, metric: metric } + $._config), + labels: { + quantile: quantile, + }, + } + for quantile in ['0.99', '0.9', '0.5'] + for metric in [ + 'scheduler_e2e_scheduling_duration_seconds', + 'scheduler_scheduling_attempt_duration_seconds', + ] + ] + [ + { + record: 'cluster:%s:avg' % metric, + expr: ||| + sum by(cluster) (rate(%(metric)s_sum{%(kubeSchedulerSelector)s}[5m])) / sum by(cluster) (rate(%(metric)s_count{%(kubeSchedulerSelector)s}[5m])) + ||| % ({ metric: metric } + $._config), + } + for metric in [ + 'scheduler_e2e_scheduling_duration_seconds', + 'scheduler_scheduling_attempt_duration_seconds', + ] + ], + } ], }, } diff --git a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml index ee67afc78ef9..50693cf6f27c 100644 --- a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml +++ b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml @@ -428,3 +428,41 @@ spec: labels: quantile: "0.5" record: etcd:etcd_disk_backend_commit_duration:histogram_quantile + - name: whizard-telemetry-kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile + - expr: | + sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_count{job="kube-scheduler"}[5m])) + record: cluster:scheduler_e2e_scheduling_duration_seconds:avg + - expr: | + sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_count{job="kube-scheduler"}[5m])) + record: cluster:scheduler_scheduling_attempt_duration_seconds:avg