From a7e9cbaa62f6a6f828fcbc8afd5e169393e6ef07 Mon Sep 17 00:00:00 2001 From: frezes Date: Wed, 6 Mar 2024 10:27:41 +0800 Subject: [PATCH] [kube-prometheus-stack] fix missing unscheduled Pods Signed-off-by: frezes --- charts/kube-prometheus-stack/Chart.yaml | 2 +- .../whizard-telemetry-cluster.rules.yaml | 64 +++++++++++----- .../rules/custom.libsonnet | 76 +++++++++++++------ .../whizard-telemetry-prometheusRule.yaml | 64 +++++++++++----- 4 files changed, 145 insertions(+), 61 deletions(-) diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index 18abd8988914..77f0a7963e9c 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -21,7 +21,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 52.1.16 +version: 52.1.17 appVersion: v0.68.0 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-cluster.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-cluster.rules.yaml index c9e268a328c2..0663310509bb 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-cluster.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-cluster.rules.yaml @@ -26,7 +26,51 @@ spec: rules: - expr: |- max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) ( - kube_pod_info{job="kube-state-metrics"} + kube_pod_info{job="kube-state-metrics",node!=""} + * on (cluster, namespace) group_left (workspace) + max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"}) + * on (cluster, namespace, pod) group_left (workload, workload_type) + max by (cluster, namespace, pod, workload, workload_type) ( + label_join( + label_join( + kube_pod_owner{job="kube-state-metrics",owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job"}, + "workload", + "$1", + "owner_name" + ), + "workload_type", + "$1", + "owner_kind" + ) + or + kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"} + * on (cluster, namespace, pod) group_left (workload_type, workload) + namespace_workload_pod:kube_pod_owner:relabel + ) + * on (cluster, namespace, pod) group_left (qos_class) + max by (cluster, namespace, pod, qos_class) ( + kube_pod_status_qos_class{job="kube-state-metrics"} > 0 + ) + * on (cluster, node) group_left (role) + max by (cluster, node, role) ( + kube_node_info{job="kube-state-metrics"} + * on (cluster, node) group_left (role) + max by (cluster, node, role) ( + ( + kube_node_role{job="kube-state-metrics",role="worker"} + unless ignoring (role) + kube_node_role{job="kube-state-metrics",role="control-plane"} + ) + or + kube_node_role{job="kube-state-metrics",role="control-plane"} + ) + or + kube_node_info{job="kube-state-metrics"} + unless on (cluster, node) + kube_node_role{job="kube-state-metrics"} + ) + or + kube_pod_info{job="kube-state-metrics",node=""} * on (cluster, namespace) group_left (workspace) max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"}) * on (cluster, namespace, pod) group_left (workload, workload_type) @@ -44,29 +88,13 @@ spec: ) or kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"} - * on (namespace, pod) group_left (workload_type, workload) + * on (cluster, namespace, pod) group_left (workload_type, workload) namespace_workload_pod:kube_pod_owner:relabel ) * on (cluster, namespace, pod) group_left (qos_class) max by (cluster, namespace, pod, qos_class) ( kube_pod_status_qos_class{job="kube-state-metrics"} > 0 ) - * on (cluster, node) group_left (role) - max by (cluster, node, role) ( - kube_node_info{job="kube-state-metrics"} - * on (cluster, node) group_left (role) - max by (cluster, node, role) ( - ( - kube_node_role{job="kube-state-metrics",role="worker"} - unless ignoring (role) - kube_node_role{job="kube-state-metrics",role="control-plane"} - ) - or - kube_node_role{job="kube-state-metrics",role="control-plane"} - ) - or - kube_node_info{job="kube-state-metrics"} unless on(cluster,node) kube_node_role{job="kube-state-metrics"} - ) ) record: 'workspace_workload_node:kube_pod_info:' {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} diff --git a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet index e7207c2fe240..61e7e2022838 100644 --- a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet +++ b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet @@ -26,7 +26,51 @@ record: 'workspace_workload_node:kube_pod_info:', expr: ||| max by (%(clusterLabel)s, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) ( - kube_pod_info{%(kubeStateMetricsSelector)s} + kube_pod_info{%(kubeStateMetricsSelector)s,node!=""} + * on (%(clusterLabel)s, namespace) group_left (workspace) + max by (%(clusterLabel)s, namespace, workspace) (kube_namespace_labels{%(kubeStateMetricsSelector)s}) + * on (%(clusterLabel)s, namespace, pod) group_left (workload, workload_type) + max by (%(clusterLabel)s, namespace, pod, workload, workload_type) ( + label_join( + label_join( + kube_pod_owner{%(kubeStateMetricsSelector)s,owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job"}, + "workload", + "$1", + "owner_name" + ), + "workload_type", + "$1", + "owner_kind" + ) + or + kube_pod_owner{%(kubeStateMetricsSelector)s,owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"} + * on (%(clusterLabel)s, namespace, pod) group_left (workload_type, workload) + namespace_workload_pod:kube_pod_owner:relabel + ) + * on (%(clusterLabel)s, namespace, pod) group_left (qos_class) + max by (%(clusterLabel)s, namespace, pod, qos_class) ( + kube_pod_status_qos_class{%(kubeStateMetricsSelector)s} > 0 + ) + * on (%(clusterLabel)s, node) group_left (role) + max by (%(clusterLabel)s, node, role) ( + kube_node_info{%(kubeStateMetricsSelector)s} + * on (%(clusterLabel)s, node) group_left (role) + max by (%(clusterLabel)s, node, role) ( + ( + kube_node_role{%(kubeStateMetricsSelector)s,role="worker"} + unless ignoring (role) + kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"} + ) + or + kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"} + ) + or + kube_node_info{%(kubeStateMetricsSelector)s} + unless on (%(clusterLabel)s, node) + kube_node_role{%(kubeStateMetricsSelector)s} + ) + or + kube_pod_info{%(kubeStateMetricsSelector)s,node=""} * on (%(clusterLabel)s, namespace) group_left (workspace) max by (%(clusterLabel)s, namespace, workspace) (kube_namespace_labels{%(kubeStateMetricsSelector)s}) * on (%(clusterLabel)s, namespace, pod) group_left (workload, workload_type) @@ -44,29 +88,13 @@ ) or kube_pod_owner{%(kubeStateMetricsSelector)s,owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"} - * on (namespace, pod) group_left (workload_type, workload) + * on (%(clusterLabel)s, namespace, pod) group_left (workload_type, workload) namespace_workload_pod:kube_pod_owner:relabel ) * on (%(clusterLabel)s, namespace, pod) group_left (qos_class) max by (%(clusterLabel)s, namespace, pod, qos_class) ( kube_pod_status_qos_class{%(kubeStateMetricsSelector)s} > 0 ) - * on (%(clusterLabel)s, node) group_left (role) - max by (%(clusterLabel)s, node, role) ( - kube_node_info{%(kubeStateMetricsSelector)s} - * on (%(clusterLabel)s, node) group_left (role) - max by (%(clusterLabel)s, node, role) ( - ( - kube_node_role{%(kubeStateMetricsSelector)s,role="worker"} - unless ignoring (role) - kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"} - ) - or - kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"} - ) - or - kube_node_info{%(kubeStateMetricsSelector)s} unless on(%(clusterLabel)s,node) kube_node_role{%(kubeStateMetricsSelector)s} - ) ) ||| % $._config }, @@ -238,7 +266,7 @@ { record: 'node:node_pod_quota:sum', expr: ||| - sum by (%(clusterLabel)s, node) (kube_node_status_allocatable{job="kube-state-metrics",resource="pods"}) + sum by (%(clusterLabel)s, node) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="pods"}) ||| % $._config, }, { @@ -247,21 +275,21 @@ count by (%(clusterLabel)s, node) ( node_namespace_pod:kube_pod_info:{node!=""} unless on (%(podLabel)s, namespace, %(clusterLabel)s) - (kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0) + (kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Succeeded"} > 0) unless on (%(podLabel)s, namespace, %(clusterLabel)s) ( - (kube_pod_status_ready{condition="true",job="kube-state-metrics"} > 0) + (kube_pod_status_ready{condition="true",%(kubeStateMetricsSelector)s} > 0) and on (%(podLabel)s, namespace, %(clusterLabel)s) - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} > 0) + (kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} > 0) ) unless on (%(clusterLabel)s, %(podLabel)s, namespace) - kube_pod_container_status_waiting_reason{job="kube-state-metrics",reason="ContainerCreating"} > 0 + kube_pod_container_status_waiting_reason{%(kubeStateMetricsSelector)s,reason="ContainerCreating"} > 0 ) / count by (%(clusterLabel)s, node) ( node_namespace_pod:kube_pod_info:{node!=""} unless on (%(podLabel)s, namespace, %(clusterLabel)s) - kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0 + kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Succeeded"} > 0 ) ||| % $._config, }, diff --git a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml index 50693cf6f27c..65246d311136 100644 --- a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml +++ b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml @@ -14,7 +14,51 @@ spec: rules: - expr: | max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) ( - kube_pod_info{job="kube-state-metrics"} + kube_pod_info{job="kube-state-metrics",node!=""} + * on (cluster, namespace) group_left (workspace) + max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"}) + * on (cluster, namespace, pod) group_left (workload, workload_type) + max by (cluster, namespace, pod, workload, workload_type) ( + label_join( + label_join( + kube_pod_owner{job="kube-state-metrics",owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job"}, + "workload", + "$1", + "owner_name" + ), + "workload_type", + "$1", + "owner_kind" + ) + or + kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"} + * on (cluster, namespace, pod) group_left (workload_type, workload) + namespace_workload_pod:kube_pod_owner:relabel + ) + * on (cluster, namespace, pod) group_left (qos_class) + max by (cluster, namespace, pod, qos_class) ( + kube_pod_status_qos_class{job="kube-state-metrics"} > 0 + ) + * on (cluster, node) group_left (role) + max by (cluster, node, role) ( + kube_node_info{job="kube-state-metrics"} + * on (cluster, node) group_left (role) + max by (cluster, node, role) ( + ( + kube_node_role{job="kube-state-metrics",role="worker"} + unless ignoring (role) + kube_node_role{job="kube-state-metrics",role="control-plane"} + ) + or + kube_node_role{job="kube-state-metrics",role="control-plane"} + ) + or + kube_node_info{job="kube-state-metrics"} + unless on (cluster, node) + kube_node_role{job="kube-state-metrics"} + ) + or + kube_pod_info{job="kube-state-metrics",node=""} * on (cluster, namespace) group_left (workspace) max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"}) * on (cluster, namespace, pod) group_left (workload, workload_type) @@ -32,29 +76,13 @@ spec: ) or kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"} - * on (namespace, pod) group_left (workload_type, workload) + * on (cluster, namespace, pod) group_left (workload_type, workload) namespace_workload_pod:kube_pod_owner:relabel ) * on (cluster, namespace, pod) group_left (qos_class) max by (cluster, namespace, pod, qos_class) ( kube_pod_status_qos_class{job="kube-state-metrics"} > 0 ) - * on (cluster, node) group_left (role) - max by (cluster, node, role) ( - kube_node_info{job="kube-state-metrics"} - * on (cluster, node) group_left (role) - max by (cluster, node, role) ( - ( - kube_node_role{job="kube-state-metrics",role="worker"} - unless ignoring (role) - kube_node_role{job="kube-state-metrics",role="control-plane"} - ) - or - kube_node_role{job="kube-state-metrics",role="control-plane"} - ) - or - kube_node_info{job="kube-state-metrics"} unless on(cluster,node) kube_node_role{job="kube-state-metrics"} - ) ) record: 'workspace_workload_node:kube_pod_info:' - name: whizard-telemetry-node.rules