[kube-prometheus-stack] fix missing unscheduled Pods
Signed-off-by: frezes <[email protected]>
frezes committed Mar 6, 2024
1 parent a37a570 commit a7e9cba
Showing 4 changed files with 145 additions and 61 deletions.
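
Note on the change (a sketch distilled from the diff below, not part of the original commit message): kube_pod_info reports an empty node label for Pods that have not yet been scheduled to a node, so the previous workspace_workload_node:kube_pod_info: expression, which always joined Pod info with node metadata, silently dropped those Pods. The updated rule combines two branches with "or": one for Pods already bound to a node, which keeps the node/role join, and one for unscheduled Pods, which skips it. The selector split at the core of the fix looks like this:

    kube_pod_info{job="kube-state-metrics",node!=""}   # scheduled Pods, still joined with kube_node_info / kube_node_role
    or
    kube_pod_info{job="kube-state-metrics",node=""}    # pending, unscheduled Pods, kept without the node join

The diff also adds the cluster label to the on (...) vector-matching clauses of the workload join (on (namespace, pod) becomes on (cluster, namespace, pod)), and replaces the hard-coded job="kube-state-metrics" matcher with the %(kubeStateMetricsSelector)s template variable in the jsonnet rules.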
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
@@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 52.1.16
version: 52.1.17
appVersion: v0.68.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus

@@ -26,7 +26,51 @@ spec:
rules:
- expr: |-
max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
kube_pod_info{job="kube-state-metrics"}
kube_pod_info{job="kube-state-metrics",node!=""}
* on (cluster, namespace) group_left (workspace)
max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"})
* on (cluster, namespace, pod) group_left (workload, workload_type)
max by (cluster, namespace, pod, workload, workload_type) (
label_join(
label_join(
kube_pod_owner{job="kube-state-metrics",owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job"},
"workload",
"$1",
"owner_name"
),
"workload_type",
"$1",
"owner_kind"
)
or
kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"}
* on (cluster, namespace, pod) group_left (workload_type, workload)
namespace_workload_pod:kube_pod_owner:relabel
)
* on (cluster, namespace, pod) group_left (qos_class)
max by (cluster, namespace, pod, qos_class) (
kube_pod_status_qos_class{job="kube-state-metrics"} > 0
)
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
kube_node_info{job="kube-state-metrics"}
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
(
kube_node_role{job="kube-state-metrics",role="worker"}
unless ignoring (role)
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_info{job="kube-state-metrics"}
unless on (cluster, node)
kube_node_role{job="kube-state-metrics"}
)
or
kube_pod_info{job="kube-state-metrics",node=""}
* on (cluster, namespace) group_left (workspace)
max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"})
* on (cluster, namespace, pod) group_left (workload, workload_type)
@@ -44,29 +88,13 @@ spec:
)
or
kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"}
* on (namespace, pod) group_left (workload_type, workload)
* on (cluster, namespace, pod) group_left (workload_type, workload)
namespace_workload_pod:kube_pod_owner:relabel
)
* on (cluster, namespace, pod) group_left (qos_class)
max by (cluster, namespace, pod, qos_class) (
kube_pod_status_qos_class{job="kube-state-metrics"} > 0
)
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
kube_node_info{job="kube-state-metrics"}
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
(
kube_node_role{job="kube-state-metrics",role="worker"}
unless ignoring (role)
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_info{job="kube-state-metrics"} unless on(cluster,node) kube_node_role{job="kube-state-metrics"}
)
)
record: 'workspace_workload_node:kube_pod_info:'
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}

@@ -26,7 +26,51 @@
record: 'workspace_workload_node:kube_pod_info:',
expr: |||
max by (%(clusterLabel)s, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
kube_pod_info{%(kubeStateMetricsSelector)s}
kube_pod_info{%(kubeStateMetricsSelector)s,node!=""}
* on (%(clusterLabel)s, namespace) group_left (workspace)
max by (%(clusterLabel)s, namespace, workspace) (kube_namespace_labels{%(kubeStateMetricsSelector)s})
* on (%(clusterLabel)s, namespace, pod) group_left (workload, workload_type)
max by (%(clusterLabel)s, namespace, pod, workload, workload_type) (
label_join(
label_join(
kube_pod_owner{%(kubeStateMetricsSelector)s,owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job"},
"workload",
"$1",
"owner_name"
),
"workload_type",
"$1",
"owner_kind"
)
or
kube_pod_owner{%(kubeStateMetricsSelector)s,owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"}
* on (%(clusterLabel)s, namespace, pod) group_left (workload_type, workload)
namespace_workload_pod:kube_pod_owner:relabel
)
* on (%(clusterLabel)s, namespace, pod) group_left (qos_class)
max by (%(clusterLabel)s, namespace, pod, qos_class) (
kube_pod_status_qos_class{%(kubeStateMetricsSelector)s} > 0
)
* on (%(clusterLabel)s, node) group_left (role)
max by (%(clusterLabel)s, node, role) (
kube_node_info{%(kubeStateMetricsSelector)s}
* on (%(clusterLabel)s, node) group_left (role)
max by (%(clusterLabel)s, node, role) (
(
kube_node_role{%(kubeStateMetricsSelector)s,role="worker"}
unless ignoring (role)
kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"}
)
or
kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"}
)
or
kube_node_info{%(kubeStateMetricsSelector)s}
unless on (%(clusterLabel)s, node)
kube_node_role{%(kubeStateMetricsSelector)s}
)
or
kube_pod_info{%(kubeStateMetricsSelector)s,node=""}
* on (%(clusterLabel)s, namespace) group_left (workspace)
max by (%(clusterLabel)s, namespace, workspace) (kube_namespace_labels{%(kubeStateMetricsSelector)s})
* on (%(clusterLabel)s, namespace, pod) group_left (workload, workload_type)
@@ -44,29 +88,13 @@
)
or
kube_pod_owner{%(kubeStateMetricsSelector)s,owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"}
* on (namespace, pod) group_left (workload_type, workload)
* on (%(clusterLabel)s, namespace, pod) group_left (workload_type, workload)
namespace_workload_pod:kube_pod_owner:relabel
)
* on (%(clusterLabel)s, namespace, pod) group_left (qos_class)
max by (%(clusterLabel)s, namespace, pod, qos_class) (
kube_pod_status_qos_class{%(kubeStateMetricsSelector)s} > 0
)
* on (%(clusterLabel)s, node) group_left (role)
max by (%(clusterLabel)s, node, role) (
kube_node_info{%(kubeStateMetricsSelector)s}
* on (%(clusterLabel)s, node) group_left (role)
max by (%(clusterLabel)s, node, role) (
(
kube_node_role{%(kubeStateMetricsSelector)s,role="worker"}
unless ignoring (role)
kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"}
)
or
kube_node_role{%(kubeStateMetricsSelector)s,role="control-plane"}
)
or
kube_node_info{%(kubeStateMetricsSelector)s} unless on(%(clusterLabel)s,node) kube_node_role{%(kubeStateMetricsSelector)s}
)
)
||| % $._config
},
@@ -238,7 +266,7 @@
{
record: 'node:node_pod_quota:sum',
expr: |||
sum by (%(clusterLabel)s, node) (kube_node_status_allocatable{job="kube-state-metrics",resource="pods"})
sum by (%(clusterLabel)s, node) (kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="pods"})
||| % $._config,
},
{
@@ -247,21 +275,21 @@
count by (%(clusterLabel)s, node) (
node_namespace_pod:kube_pod_info:{node!=""}
unless on (%(podLabel)s, namespace, %(clusterLabel)s)
(kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0)
(kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Succeeded"} > 0)
unless on (%(podLabel)s, namespace, %(clusterLabel)s)
(
(kube_pod_status_ready{condition="true",job="kube-state-metrics"} > 0)
(kube_pod_status_ready{condition="true",%(kubeStateMetricsSelector)s} > 0)
and on (%(podLabel)s, namespace, %(clusterLabel)s)
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} > 0)
(kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} > 0)
)
unless on (%(clusterLabel)s, %(podLabel)s, namespace)
kube_pod_container_status_waiting_reason{job="kube-state-metrics",reason="ContainerCreating"} > 0
kube_pod_container_status_waiting_reason{%(kubeStateMetricsSelector)s,reason="ContainerCreating"} > 0
)
/
count by (%(clusterLabel)s, node) (
node_namespace_pod:kube_pod_info:{node!=""}
unless on (%(podLabel)s, namespace, %(clusterLabel)s)
kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0
kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Succeeded"} > 0
)
||| % $._config,
},
64 changes: 46 additions & 18 deletions ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
@@ -14,7 +14,51 @@ spec:
rules:
- expr: |
max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
kube_pod_info{job="kube-state-metrics"}
kube_pod_info{job="kube-state-metrics",node!=""}
* on (cluster, namespace) group_left (workspace)
max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"})
* on (cluster, namespace, pod) group_left (workload, workload_type)
max by (cluster, namespace, pod, workload, workload_type) (
label_join(
label_join(
kube_pod_owner{job="kube-state-metrics",owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job"},
"workload",
"$1",
"owner_name"
),
"workload_type",
"$1",
"owner_kind"
)
or
kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"}
* on (cluster, namespace, pod) group_left (workload_type, workload)
namespace_workload_pod:kube_pod_owner:relabel
)
* on (cluster, namespace, pod) group_left (qos_class)
max by (cluster, namespace, pod, qos_class) (
kube_pod_status_qos_class{job="kube-state-metrics"} > 0
)
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
kube_node_info{job="kube-state-metrics"}
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
(
kube_node_role{job="kube-state-metrics",role="worker"}
unless ignoring (role)
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_info{job="kube-state-metrics"}
unless on (cluster, node)
kube_node_role{job="kube-state-metrics"}
)
or
kube_pod_info{job="kube-state-metrics",node=""}
* on (cluster, namespace) group_left (workspace)
max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"})
* on (cluster, namespace, pod) group_left (workload, workload_type)
@@ -32,29 +76,13 @@ spec:
)
or
kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"}
* on (namespace, pod) group_left (workload_type, workload)
* on (cluster, namespace, pod) group_left (workload_type, workload)
namespace_workload_pod:kube_pod_owner:relabel
)
* on (cluster, namespace, pod) group_left (qos_class)
max by (cluster, namespace, pod, qos_class) (
kube_pod_status_qos_class{job="kube-state-metrics"} > 0
)
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
kube_node_info{job="kube-state-metrics"}
* on (cluster, node) group_left (role)
max by (cluster, node, role) (
(
kube_node_role{job="kube-state-metrics",role="worker"}
unless ignoring (role)
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_info{job="kube-state-metrics"} unless on(cluster,node) kube_node_role{job="kube-state-metrics"}
)
)
record: 'workspace_workload_node:kube_pod_info:'
- name: whizard-telemetry-node.rules