[kube-prometheus-stack] fix recording rules
Signed-off-by: frezes <[email protected]>
frezes committed Dec 26, 2023
1 parent 2a92ab6 commit 90c4fd6
Showing 5 changed files with 46 additions and 32 deletions.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
@@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
-version: 52.1.4
+version: 52.1.5
appVersion: v0.68.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
@@ -24,7 +24,7 @@ spec:
groups:
- name: whizard-telemetry-namespace.rules
rules:
-- expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+- expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_cpu_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
@@ -35,7 +35,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-- expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+- expr: sum by(cluster,namespace,workload,workload_type)(container_memory_usage_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}))* on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_memory_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
@@ -46,7 +46,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-- expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+- expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_memory_wo_cache_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
@@ -57,7 +57,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_net_bytes_received:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
@@ -68,7 +68,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_net_bytes_transmitted:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
@@ -79,9 +79,9 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-- expr: label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "deamonset", "(.*)")
+- expr: label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "daemonset", "(.*)")
labels:
-workload_type: deamonset
+workload_type: daemonset
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
@@ -90,7 +90,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-record: namespace:workload_unavalibled_replicas:ratio
+record: namespace:workload_unavailable_replicas:ratio
- expr: label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)")
labels:
workload_type: deployment
@@ -102,7 +102,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-record: namespace:workload_unavalibled_replicas:ratio
+record: namespace:workload_unavailable_replicas:ratio
- expr: label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)")
labels:
workload_type: statefulset
@@ -114,5 +114,5 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
-record: namespace:workload_unavalibled_replicas:ratio
+record: namespace:workload_unavailable_replicas:ratio
{{- end }}
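Most of the expression changes above add "namespace" to the vector-matching labels. Pod names are unique only within a namespace, so joining on (cluster, pod) alone can attach workload metadata from an identically named pod in another namespace, or abort with a many-to-many matching error. The workload memory-usage rule also switches from the working-set recording rule (which had made it identical to the _wo_cache rule) to container_memory_usage_bytes, which counts page cache. A minimal sketch of the corrected join, reusing the series names from the diff above:

# Join per-pod CPU usage with workload metadata; "namespace" must be part of
# the matching labels because pod names repeat across namespaces.
sum by (cluster, namespace, workload, workload_type) (
    node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{}
  * on (cluster, namespace, pod) group_left (workload, workload_type)
    workspace_workload_node:kube_pod_info:{}
)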
@@ -35,6 +35,17 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
+- expr: node:node_memory_bytes_total:sum - node:node_memory_bytes_used_total:sum
+record: node:node_memory_bytes_available:sum
+{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+labels:
+{{- with .Values.defaultRules.additionalRuleLabels }}
+{{- toYaml . | nindent 8 }}
+{{- end }}
+{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+{{- toYaml . | nindent 8 }}
+{{- end }}
+{{- end }}
- expr: sum by (cluster, node, instance, host_ip)(node_memory_MemTotal_bytes{job="node-exporter"} -(node_memory_MemAvailable_bytes{job="node-exporter"} or (node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"}))) * on (cluster,node) group_left(role) ((kube_node_role{role="worker"} unless ignoring (role) kube_node_role{role="control-plane"}) or kube_node_role{role="control-plane"})
record: node:node_memory_bytes_used_total:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
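The node rules gain a node:node_memory_bytes_available:sum series that records total minus used memory per node, so consumers can read one precomputed value instead of repeating the subtraction. One possible query on top of it (an illustration, not part of this commit):

# Fraction of each node's memory that is still available.
node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum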
@@ -69,7 +69,7 @@
expr: |||
node:node_memory_bytes_total:sum - node:node_memory_bytes_used_total:sum
||| % $._config,
-}
+},
{
record: 'node:node_memory_bytes_used_total:sum',
expr: |||
@@ -207,44 +207,44 @@
{
record: 'namespace:workload_cpu_usage:sum',
expr: |||
-sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
||| % $._config,
},
{
record: 'namespace:workload_memory_usage:sum',
expr: |||
-sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by(cluster,namespace,workload,workload_type)(container_memory_usage_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}))* on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
||| % $._config,
},
{
record: 'namespace:workload_memory_wo_cache_usage:sum',
expr: |||
-sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
||| % $._config,
},
{
record: 'namespace:workload_net_bytes_received:sum_irate',
expr: |||
-sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
||| % $._config,
},
{
record: 'namespace:workload_net_bytes_transmitted:sum_irate',
expr: |||
-sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
||| % $._config,
},
{
-record: 'namespace:workload_unavalibled_replicas:ratio',
+record: 'namespace:workload_unavailable_replicas:ratio',
expr: |||
-label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, %(clusterLabel)s) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,%(clusterLabel)s), "workload", "$1", "deamonset", "(.*)")
+label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, %(clusterLabel)s) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,%(clusterLabel)s), "workload", "$1", "daemonset", "(.*)")
||| % $._config,
labels: {
-workload_type: 'deamonset',
+workload_type: 'daemonset',
},
},
{
-record: 'namespace:workload_unavalibled_replicas:ratio',
+record: 'namespace:workload_unavailable_replicas:ratio',
expr: |||
label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, %(clusterLabel)s) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, %(clusterLabel)s), "workload", "$1", "deployment", "(.*)")
||| % $._config,
@@ -253,7 +253,7 @@
},
},
{
-record: 'namespace:workload_unavalibled_replicas:ratio',
+record: 'namespace:workload_unavailable_replicas:ratio',
expr: |||
label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, %(clusterLabel)s) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, %(clusterLabel)s), "workload", "$1", "statefulset", "(.*)")
||| % $._config,
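Besides the join-label fix, the daemonset rules correct a misspelled source label in label_replace and rename the record from namespace:workload_unavalibled_replicas:ratio to namespace:workload_unavailable_replicas:ratio. With the old source label "deamonset" the regex only ever saw an empty value, so no "workload" label was written to the result; reading the real "daemonset" label makes the rule behave like its deployment and statefulset counterparts. The corrected call, reformatted from the diff for readability:

# Unavailable-replica ratio per DaemonSet, with the DaemonSet name copied
# into a "workload" label.
label_replace(
    sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster)
  / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace, cluster),
  "workload", "$1", "daemonset", "(.*)"
)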
23 changes: 13 additions & 10 deletions ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
@@ -48,6 +48,9 @@ spec:
- expr: |
node:node_memory_bytes_used_total:sum / node:node_memory_bytes_total:sum
record: node:node_memory_utilisation:ratio
+- expr: |
+node:node_memory_bytes_total:sum - node:node_memory_bytes_used_total:sum
+record: node:node_memory_bytes_available:sum
- expr: |
sum by (cluster, node, instance, host_ip)(node_memory_MemTotal_bytes{job="node-exporter"} -(node_memory_MemAvailable_bytes{job="node-exporter"} or (node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"}))) * on (cluster,node) group_left(role) ((kube_node_role{role="worker"} unless ignoring (role) kube_node_role{role="control-plane"}) or kube_node_role{role="control-plane"})
record: node:node_memory_bytes_used_total:sum
@@ -120,35 +123,35 @@ spec:
- name: whizard-telemetry-namespace.rules
rules:
- expr: |
-sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_cpu_usage:sum
- expr: |
-sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by(cluster,namespace,workload,workload_type)(container_memory_usage_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}))* on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_memory_usage:sum
- expr: |
-sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_memory_wo_cache_usage:sum
- expr: |
-sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_net_bytes_received:sum_irate
- expr: |
-sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_net_bytes_transmitted:sum_irate
- expr: |
-label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "deamonset", "(.*)")
+label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "daemonset", "(.*)")
labels:
-workload_type: deamonset
-record: namespace:workload_unavalibled_replicas:ratio
+workload_type: daemonset
+record: namespace:workload_unavailable_replicas:ratio
- expr: |
label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)")
labels:
workload_type: deployment
-record: namespace:workload_unavalibled_replicas:ratio
+record: namespace:workload_unavailable_replicas:ratio
- expr: |
label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)")
labels:
workload_type: statefulset
-record: namespace:workload_unavalibled_replicas:ratio
+record: namespace:workload_unavailable_replicas:ratio
- name: whizard-telemetry-apiserver.rules
rules:
- expr: |
