diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml index 71013423b90c..65d08ef32996 100644 --- a/charts/kube-prometheus-stack/Chart.yaml +++ b/charts/kube-prometheus-stack/Chart.yaml @@ -21,7 +21,7 @@ name: kube-prometheus-stack sources: - https://github.com/prometheus-community/helm-charts - https://github.com/prometheus-operator/kube-prometheus -version: 52.1.4 +version: 52.1.5 appVersion: v0.68.0 kubeVersion: ">=1.19.0-0" home: https://github.com/prometheus-operator/kube-prometheus diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml index 30c8368d2388..43f977ea4e0f 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml @@ -24,7 +24,7 @@ spec: groups: - name: whizard-telemetry-namespace.rules rules: - - expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + - expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_cpu_usage:sum {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} labels: @@ -35,7 +35,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + - expr: sum by(cluster,namespace,workload,workload_type)(container_memory_usage_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}))* on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_memory_usage:sum {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} labels: @@ -46,7 +46,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + - expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_memory_wo_cache_usage:sum {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} labels: @@ -57,7 +57,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_net_bytes_received:sum_irate {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} labels: @@ -68,7 +68,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_net_bytes_transmitted:sum_irate {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} labels: @@ -79,9 +79,9 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - - expr: label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "deamonset", "(.*)") + - expr: label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "daemonset", "(.*)") labels: - workload_type: deamonset + workload_type: daemonset {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} {{- with .Values.defaultRules.additionalRuleLabels }} {{- toYaml . | nindent 8 }} @@ -90,7 +90,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - record: namespace:workload_unavalibled_replicas:ratio + record: namespace:workload_unavailable_replicas:ratio - expr: label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)") labels: workload_type: deployment @@ -102,7 +102,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - record: namespace:workload_unavalibled_replicas:ratio + record: namespace:workload_unavailable_replicas:ratio - expr: label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)") labels: workload_type: statefulset @@ -114,5 +114,5 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} - record: namespace:workload_unavalibled_replicas:ratio + record: namespace:workload_unavailable_replicas:ratio {{- end }} \ No newline at end of file diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-node.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-node.rules.yaml index 69795f203e08..2fc21daf162e 100644 --- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-node.rules.yaml +++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-node.rules.yaml @@ -35,6 +35,17 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} {{- end }} + - expr: node:node_memory_bytes_total:sum - node:node_memory_bytes_used_total:sum + record: node:node_memory_bytes_available:sum + {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + labels: + {{- with .Values.defaultRules.additionalRuleLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} - expr: sum by (cluster, node, instance, host_ip)(node_memory_MemTotal_bytes{job="node-exporter"} -(node_memory_MemAvailable_bytes{job="node-exporter"} or (node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"}))) * on (cluster,node) group_left(role) ((kube_node_role{role="worker"} unless ignoring (role) kube_node_role{role="control-plane"}) or kube_node_role{role="control-plane"}) record: node:node_memory_bytes_used_total:sum {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }} diff --git a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet index 4627b2a408aa..0577bcbe9fab 100644 --- a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet +++ b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet @@ -69,7 +69,7 @@ expr: ||| node:node_memory_bytes_total:sum - node:node_memory_bytes_used_total:sum ||| % $._config, - } + }, { record: 'node:node_memory_bytes_used_total:sum', expr: ||| @@ -207,44 +207,44 @@ { record: 'namespace:workload_cpu_usage:sum', expr: ||| - sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) ||| % $._config, }, { record: 'namespace:workload_memory_usage:sum', expr: ||| - sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by(cluster,namespace,workload,workload_type)(container_memory_usage_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}))* on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) ||| % $._config, }, { record: 'namespace:workload_memory_wo_cache_usage:sum', expr: ||| - sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) ||| % $._config, }, { record: 'namespace:workload_net_bytes_received:sum_irate', expr: ||| - sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) ||| % $._config, }, { record: 'namespace:workload_net_bytes_transmitted:sum_irate', expr: ||| - sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) ||| % $._config, }, { - record: 'namespace:workload_unavalibled_replicas:ratio', + record: 'namespace:workload_unavailable_replicas:ratio', expr: ||| - label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, %(clusterLabel)s) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,%(clusterLabel)s), "workload", "$1", "deamonset", "(.*)") + label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, %(clusterLabel)s) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,%(clusterLabel)s), "workload", "$1", "daemonset", "(.*)") ||| % $._config, labels: { - workload_type: 'deamonset', + workload_type: 'daemonset', }, }, { - record: 'namespace:workload_unavalibled_replicas:ratio', + record: 'namespace:workload_unavailable_replicas:ratio', expr: ||| label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, %(clusterLabel)s) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, %(clusterLabel)s), "workload", "$1", "deployment", "(.*)") ||| % $._config, @@ -253,7 +253,7 @@ }, }, { - record: 'namespace:workload_unavalibled_replicas:ratio', + record: 'namespace:workload_unavailable_replicas:ratio', expr: ||| label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, %(clusterLabel)s) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, %(clusterLabel)s), "workload", "$1", "statefulset", "(.*)") ||| % $._config, diff --git a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml index 6e00d9252dbe..eb897a8ca958 100644 --- a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml +++ b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml @@ -48,6 +48,9 @@ spec: - expr: | node:node_memory_bytes_used_total:sum / node:node_memory_bytes_total:sum record: node:node_memory_utilisation:ratio + - expr: | + node:node_memory_bytes_total:sum - node:node_memory_bytes_used_total:sum + record: node:node_memory_bytes_available:sum - expr: | sum by (cluster, node, instance, host_ip)(node_memory_MemTotal_bytes{job="node-exporter"} -(node_memory_MemAvailable_bytes{job="node-exporter"} or (node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"}))) * on (cluster,node) group_left(role) ((kube_node_role{role="worker"} unless ignoring (role) kube_node_role{role="control-plane"}) or kube_node_role{role="control-plane"}) record: node:node_memory_bytes_used_total:sum @@ -120,35 +123,35 @@ spec: - name: whizard-telemetry-namespace.rules rules: - expr: | - sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_cpu_usage:sum - expr: | - sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by(cluster,namespace,workload,workload_type)(container_memory_usage_bytes{image!="",job="kubelet",metrics_path="/metrics/cadvisor"} * on (cluster, namespace, pod) group_left (node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}))* on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_memory_usage:sum - expr: | - sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_memory_wo_cache_usage:sum - expr: | - sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_net_bytes_received:sum_irate - expr: | - sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) + sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,namespace,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{}) record: namespace:workload_net_bytes_transmitted:sum_irate - expr: | - label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "deamonset", "(.*)") + label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "daemonset", "(.*)") labels: - workload_type: deamonset - record: namespace:workload_unavalibled_replicas:ratio + workload_type: daemonset + record: namespace:workload_unavailable_replicas:ratio - expr: | label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)") labels: workload_type: deployment - record: namespace:workload_unavalibled_replicas:ratio + record: namespace:workload_unavailable_replicas:ratio - expr: | label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)") labels: workload_type: statefulset - record: namespace:workload_unavalibled_replicas:ratio + record: namespace:workload_unavailable_replicas:ratio - name: whizard-telemetry-apiserver.rules rules: - expr: |