Skip to content

Commit

Permalink
[kube-prometheus-stack] update recording rules
Browse files Browse the repository at this point in the history
Signed-off-by: frezes <[email protected]>
  • Loading branch information
frezes committed Jan 31, 2024
1 parent c343aaf commit e3e9c6b
Show file tree
Hide file tree
Showing 7 changed files with 656 additions and 148 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum(irate(apiserver_request_total{job="apiserver"}[5m])) by (verb, cluster)
- expr: sum by (cluster, verb)(irate(apiserver_request_total{job="apiserver"}[5m]))
record: apiserver:apiserver_request_total:sum_verb_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
Expand All @@ -57,7 +57,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum(irate(apiserver_request_duration_seconds_sum{job="apiserver",subresource!="log", verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) by (verb, cluster) / sum(irate(apiserver_request_duration_seconds_count{job="apiserver", subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) by (verb, cluster)
- expr: sum by (cluster, verb)(irate(apiserver_request_duration_seconds_sum{job="apiserver",subresource!="log", verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m])) / sum by (cluster, verb)(irate(apiserver_request_duration_seconds_count{job="apiserver", subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"}[5m]))
record: apiserver:apiserver_request_duration:avg_by_verb
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ spec:
rules:
- expr: |-
max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
kube_pod_info * on (cluster, namespace) group_left (workspace) (kube_namespace_labels{job="kube-state-metrics"})
kube_pod_info
* on (cluster, namespace) group_left (workspace)
max by (cluster, namespace, workspace) (kube_namespace_labels{job="kube-state-metrics"})
* on (cluster, namespace, pod) group_left (workload, workload_type)
(
max by (cluster, namespace, pod, workload, workload_type) (
label_join(
label_join(
kube_pod_owner{owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job",job="kube-state-metrics"},
kube_pod_owner{job="kube-state-metrics",owner_kind!~"ReplicaSet|DaemonSet|StatefulSet|Job"},
"workload",
"$1",
"owner_name"
Expand All @@ -41,17 +43,23 @@ spec:
"owner_kind"
)
or
kube_pod_owner{owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job",job="kube-state-metrics"}
kube_pod_owner{job="kube-state-metrics",owner_kind=~"ReplicaSet|DaemonSet|StatefulSet|Job"}
# Match on cluster as well, like the other joins in this expression, so that
# same-named pods in different clusters are never cross-matched.
* on (cluster, namespace, pod) group_left (workload_type, workload)
namespace_workload_pod:kube_pod_owner:relabel
)
* on (cluster, namespace, pod) group_left (qos_class)
(kube_pod_status_qos_class{job="kube-state-metrics"} > 0)
max by (cluster, namespace, pod, qos_class) (
kube_pod_status_qos_class{job="kube-state-metrics"} > 0
)
* on (cluster, node) group_left (role)
(
(kube_node_role{role="worker",job="kube-state-metrics"} unless ignoring (role) kube_node_role{role="control-plane",job="kube-state-metrics"})
max by (cluster, node, role) (
(
kube_node_role{job="kube-state-metrics",role="worker"}
unless ignoring (role)
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
or
kube_node_role{role="control-plane",job="kube-state-metrics"}
kube_node_role{job="kube-state-metrics",role="control-plane"}
)
)
record: 'workspace_workload_node:kube_pod_info:'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,14 @@ spec:
groups:
- name: whizard-telemetry-namespace.rules
rules:
- expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
- expr: |-
sum by (cluster, namespace, workload, workload_type) (
sum by (cluster, namespace, pod) (
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
)
* on (cluster, namespace, pod) group_left (workload, workload_type)
workspace_workload_node:kube_pod_info:
)
record: namespace:workload_cpu_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
Expand All @@ -35,7 +42,14 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by(cluster,namespace,workload,workload_type)(container_memory_usage_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}))* on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
- expr: |-
sum by (cluster, namespace, workload, workload_type) (
sum by (cluster, namespace, pod) (
container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
)
* on (cluster, namespace, pod) group_left (workload, workload_type)
workspace_workload_node:kube_pod_info:
)
record: namespace:workload_memory_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
Expand All @@ -46,7 +60,14 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
- expr: |-
sum by (cluster, namespace, workload, workload_type) (
sum by (cluster, namespace, pod) (
node_namespace_pod_container:container_memory_working_set_bytes
)
* on (cluster, namespace, pod) group_left (workload, workload_type)
workspace_workload_node:kube_pod_info:
)
record: namespace:workload_memory_wo_cache_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
Expand All @@ -57,7 +78,14 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
# Per-workload network receive rate: per-pod sums of the cAdvisor receive
# counters, joined to workspace_workload_node:kube_pod_info: to pick up the
# workload / workload_type labels.
# Series are selected by namespace/pod rather than image, because network
# counters are reported per pod sandbox, not per application container.
# NOTE(review): on CRI/containerd nodes these series are typically exported
# with image="" - confirm on the target clusters.
- expr: |-
    sum by (cluster, namespace, workload, workload_type) (
      sum by (cluster, namespace, pod) (
        irate(container_network_receive_bytes_total{job="kubelet", metrics_path="/metrics/cadvisor", namespace!="", pod!=""}[5m])
      )
      * on (cluster, namespace, pod) group_left (workload, workload_type)
        workspace_workload_node:kube_pod_info:
    )
  record: namespace:workload_net_bytes_received:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
Expand All @@ -68,7 +96,14 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
# Per-workload network transmit rate: per-pod sums of the cAdvisor transmit
# counters, joined to workspace_workload_node:kube_pod_info: to pick up the
# workload / workload_type labels.
# Series are selected by namespace/pod rather than image, because network
# counters are reported per pod sandbox, not per application container.
# NOTE(review): on CRI/containerd nodes these series are typically exported
# with image="" - confirm on the target clusters.
- expr: |-
    sum by (cluster, namespace, workload, workload_type) (
      sum by (cluster, namespace, pod) (
        irate(container_network_transmit_bytes_total{job="kubelet", metrics_path="/metrics/cadvisor", namespace!="", pod!=""}[5m])
      )
      * on (cluster, namespace, pod) group_left (workload, workload_type)
        workspace_workload_node:kube_pod_info:
    )
  record: namespace:workload_net_bytes_transmitted:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
Expand Down
Loading

0 comments on commit e3e9c6b

Please sign in to comment.