Skip to content

Commit

Permalink
Merge pull request #48 from junotx/rules
Browse files Browse the repository at this point in the history
fix scheduler scheduling latency rules to be compatible with new metrics
  • Loading branch information
benjaminhuo authored Mar 1, 2024
2 parents 91ccfb4 + 63ac9ef commit b36b395
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 1 deletion.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 52.1.13
version: 52.1.14
appVersion: v0.68.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down
1 change: 1 addition & 0 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def new_representer(dumper, data):
'whizard-telemetry-namespace.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-node.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-etcd.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-kube-scheduler.rules': ' .Values.defaultRules.rules.whizardTelemetry',
}

alert_condition_map = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ rules:
- "whizard-telemetry-namespace.rules"
- "whizard-telemetry-node.rules"
- "whizard-telemetry-etcd.rules"
- "whizard-telemetry-kube-scheduler.rules"
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
{{- /*
Generated from 'whizard-telemetry-kube-scheduler.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.whizardTelemetry }}
# PrometheusRule carrying the 'whizard-telemetry-kube-scheduler.rules' recording-rule group.
# Rendered only when defaultRules.create and defaultRules.rules.whizardTelemetry are set and the
# target Kubernetes version (kubeTargetVersionOverride, else the cluster's GitVersion) is
# >=1.14.0-0 and <9.9.9-9.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
# Resource name is "<fullname>-whizard-telemetry-kube-scheduler.rules", truncated to 63
# characters with any trailing '-' removed.
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: whizard-telemetry-kube-scheduler.rules
rules:
# Quantile recording rules: 0.99, 0.9 and 0.5 histogram_quantile over the 5m rate of
# scheduler_e2e_scheduling_duration_seconds_bucket and
# scheduler_scheduling_attempt_duration_seconds_bucket for job="kube-scheduler",
# summed without the instance, pod and result labels.
- expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
# Each quantile rule always sets a 'quantile' label; defaultRules.additionalRuleLabels and
# defaultRules.additionalRuleGroupLabels.whizardTelemetry are appended when configured.
labels:
quantile: '0.99'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: '0.99'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: '0.9'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: '0.9'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: '0.5'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: '0.5'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
# Average recording rules: per-cluster 5m rate of the histogram _sum divided by the 5m rate of
# its _count. These rules have no fixed labels, so the 'labels' key is emitted only when
# additional rule labels are configured.
- expr: sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_count{job="kube-scheduler"}[5m]))
record: cluster:scheduler_e2e_scheduling_duration_seconds:avg
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_count{job="kube-scheduler"}[5m]))
record: cluster:scheduler_scheduling_attempt_duration_seconds:avg
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
podLabel: 'pod',
etcd_selector: 'job=~".*etcd.*"',
etcd_instance_labels: 'instance',
kubeSchedulerSelector: 'job="kube-scheduler"',
},

prometheusRules+:: {
Expand Down Expand Up @@ -657,6 +658,36 @@

],
},
{
  // Histogram base names (without the _bucket/_sum/_count suffix) that every
  // rule in this group is derived from.
  local schedulerDurationMetrics = [
    'scheduler_e2e_scheduling_duration_seconds',
    'scheduler_scheduling_attempt_duration_seconds',
  ],

  // One histogram_quantile rule per (quantile, metric) pair. The quantile is
  // the outer loop, so both metrics are emitted for 0.99, then 0.9, then 0.5.
  local quantileRules = [
    {
      record: 'cluster_quantile:%s:histogram_quantile' % m,
      expr: |||
        histogram_quantile(%(quantile)s, sum(rate(%(metric)s_bucket{%(kubeSchedulerSelector)s}[5m])) without(instance, %(podLabel)s, result))
      ||| % ({ quantile: q, metric: m } + $._config),
      labels: { quantile: q },
    }
    for q in ['0.99', '0.9', '0.5']
    for m in schedulerDurationMetrics
  ],

  // One per-cluster average rule per metric: rate of _sum over rate of _count.
  local averageRules = [
    {
      record: 'cluster:%s:avg' % m,
      expr: |||
        sum by(cluster) (rate(%(metric)s_sum{%(kubeSchedulerSelector)s}[5m])) / sum by(cluster) (rate(%(metric)s_count{%(kubeSchedulerSelector)s}[5m]))
      ||| % ({ metric: m } + $._config),
    }
    for m in schedulerDurationMetrics
  ],

  name: 'whizard-telemetry-kube-scheduler.rules',
  rules: quantileRules + averageRules,
}
],
},
}
38 changes: 38 additions & 0 deletions ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -428,3 +428,41 @@ spec:
labels:
quantile: "0.5"
record: etcd:etcd_disk_backend_commit_duration:histogram_quantile
# Recording-rule group built from the kube-scheduler duration histograms
# (all selectors use job="kube-scheduler").
- name: whizard-telemetry-kube-scheduler.rules
rules:
# Quantile rules: 0.99, 0.9 and 0.5 histogram_quantile over the 5m rate of the
# scheduler_e2e_scheduling_duration_seconds and
# scheduler_scheduling_attempt_duration_seconds buckets, summed without the
# instance, pod and result labels. Each rule carries a matching 'quantile' label.
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod, result))
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
# Average rules: per-cluster 5m rate of the histogram _sum divided by the 5m rate
# of its _count, one rule per metric.
- expr: |
sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_e2e_scheduling_duration_seconds_count{job="kube-scheduler"}[5m]))
record: cluster:scheduler_e2e_scheduling_duration_seconds:avg
- expr: |
sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_sum{job="kube-scheduler"}[5m])) / sum by(cluster) (rate(scheduler_scheduling_attempt_duration_seconds_count{job="kube-scheduler"}[5m]))
record: cluster:scheduler_scheduling_attempt_duration_seconds:avg

0 comments on commit b36b395

Please sign in to comment.