From 526e2bd2569d6f5bbe43a379c15eaa8f7d78eb04 Mon Sep 17 00:00:00 2001
From: frezes
Date: Mon, 25 Dec 2023 15:53:33 +0800
Subject: [PATCH] [kube-prometheus-stack] update recording rules and remove
 grafana dependency

Signed-off-by: frezes
---
 charts/kube-prometheus-stack/Chart.yaml       |   6 +-
 .../extension_values.yaml                     |  61 ---------
 .../hack/sync_prometheus_rules.py             |   7 +-
 .../templates/prometheus/_rules.tpl           |   7 +-
 ...=> whizard-telemetry-apiserver.rules.yaml} |   6 +-
 ...l => whizard-telemetry-cluster.rules.yaml} |   6 +-
 .../whizard-telemetry-namespace.rules.yaml    | 118 ++++++++++++++++++
 ...yaml => whizard-telemetry-node.rules.yaml} |  28 ++++-
 .../rules/custom.libsonnet                    |  86 ++++++++++++-
 .../whizard-telemetry-prometheusRule.yaml     |  44 ++++++-
 10 files changed, 282 insertions(+), 87 deletions(-)
 rename charts/kube-prometheus-stack/templates/prometheus/rules-1.14/{whizard-apiserver-recording.rules.yaml => whizard-telemetry-apiserver.rules.yaml} (95%)
 rename charts/kube-prometheus-stack/templates/prometheus/rules-1.14/{whizard-cluster-recording.rules.yaml => whizard-telemetry-cluster.rules.yaml} (94%)
 create mode 100644 charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml
 rename charts/kube-prometheus-stack/templates/prometheus/rules-1.14/{whizard-node-recording.rules.yaml => whizard-telemetry-node.rules.yaml} (88%)

diff --git a/charts/kube-prometheus-stack/Chart.yaml b/charts/kube-prometheus-stack/Chart.yaml
index c1cb86e64449..edf10cea8734 100644
--- a/charts/kube-prometheus-stack/Chart.yaml
+++ b/charts/kube-prometheus-stack/Chart.yaml
@@ -21,7 +21,7 @@ name: kube-prometheus-stack
 sources:
   - https://github.com/prometheus-community/helm-charts
   - https://github.com/prometheus-operator/kube-prometheus
-version: 52.1.2
+version: 52.1.3
 appVersion: v0.68.0
 kubeVersion: ">=1.19.0-0"
 home: https://github.com/prometheus-operator/kube-prometheus
@@ -50,10 +50,6 @@ dependencies:
     version: "4.23.*"
     repository: https://prometheus-community.github.io/helm-charts
     condition: nodeExporter.enabled
-  - name: grafana
-    version: "6.60.*"
-    repository: https://grafana.github.io/helm-charts
-    condition: grafana.enabled
   - name: prometheus-windows-exporter
     repository: https://prometheus-community.github.io/helm-charts
     version: "0.1.*"
diff --git a/charts/kube-prometheus-stack/extension_values.yaml b/charts/kube-prometheus-stack/extension_values.yaml
index 11f3d505062f..ef857a1ba4ee 100644
--- a/charts/kube-prometheus-stack/extension_values.yaml
+++ b/charts/kube-prometheus-stack/extension_values.yaml
@@ -11,22 +11,6 @@ global:
   # or
   # - "image-pull-secret"
 
-namespaceOverride: kubesphere-monitoring-system
-
-alertmanager:
-  alertmanagerSpec:
-    image:
-      registry: quay.io
-      repository: prometheus/alertmanager
-      tag: ""
-    replicas: 1
-    resources:
-      limits:
-        cpu: 200m
-        memory: 200Mi
-      requests:
-        cpu: 20m
-        memory: 30Mi
 
 prometheus:
   prometheusSpec:
@@ -77,7 +61,6 @@ prometheusOperator:
       memory: 50Mi
 
 kube-state-metrics:
-  namespaceOverride: kubesphere-monitoring-system
   image:
     registry: docker.io
     repository: kubesphere/kube-state-metrics
@@ -104,7 +87,6 @@ kube-state-metrics:
 
 
 prometheus-node-exporter:
-  namespaceOverride: kubesphere-monitoring-system
   image:
     registry: quay.io
     repository: prometheus/node-exporter
@@ -129,46 +111,3 @@ prometheus-node-exporter:
       cpu: 20m
       memory: 20Mi
 
-grafana:
-  enabled: false
-  namespaceOverride: kubesphere-monitoring-system
-  # grafana does not support the global.imageRegistry
-  image:
-    repository: grafana/grafana
-    tag: ""
-
-  resources:
-    limits:
-      cpu: "1"
-      memory: 2Gi
-    requests:
-      cpu: 100m
-      memory: 100Mi
-  persistence:
-    enabled: true
-    #storageClassName: default
-    type: "pvc"
-    size: 50Mi
-
-  sidecar:
-    image:
-      repository: quay.io/kiwigrid/k8s-sidecar
-      tag: 1.24.6
-    resources:
-      limits:
-        cpu: 100m
-        memory: 100Mi
-      requests:
-        cpu: 50m
-        memory: 50Mi
-  initChownData:
-    image:
-      repository: busybox
-      tag: "1.31.1"
-    resources:
-      limits:
-        cpu: 40m
-        memory: 40Mi
-      requests:
-        cpu: 20m
-        memory: 20Mi
\ No newline at end of file
diff --git a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
index ad0ca14b1df6..23c4e0f45c96 100755
--- a/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
+++ b/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
@@ -113,9 +113,10 @@ def new_representer(dumper, data):
     'node-exporter.rules': ' .Values.defaultRules.rules.nodeExporterRecording',
 
     # custom rules
-    'whizard-apiserver-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
-    'whizard-cluster-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
-    'whizard-node-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
+    'whizard-telemetry-apiserver.rules': ' .Values.defaultRules.rules.whizardTelemetry',
+    'whizard-telemetry-cluster.rules': ' .Values.defaultRules.rules.whizardTelemetry',
+    'whizard-telemetry-namespace.rules': ' .Values.defaultRules.rules.whizardTelemetry',
+    'whizard-telemetry-node.rules': ' .Values.defaultRules.rules.whizardTelemetry',
 }
 
 alert_condition_map = {
diff --git a/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl b/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl
index 82def9e71f90..1b9d41f8e792 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl
+++ b/charts/kube-prometheus-stack/templates/prometheus/_rules.tpl
@@ -14,7 +14,8 @@ rules:
   - "node.rules"
   - "kubelet.rules"
   - "node-exporter.rules"
-  - "whizard-apiserver-recording.rules"
-  - "whizard-cluster-recording.rules"
-  - "whizard-node-recording.rules"
+  - "whizard-telemetry-apiserver.rules"
+  - "whizard-telemetry-cluster.rules"
+  - "whizard-telemetry-namespace.rules"
+  - "whizard-telemetry-node.rules"
 {{- end }}
\ No newline at end of file
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-apiserver-recording.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-apiserver.rules.yaml
similarity index 95%
rename from charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-apiserver-recording.rules.yaml
rename to charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-apiserver.rules.yaml
index 667fc36a5c17..643f72de09e4 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-apiserver-recording.rules.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-apiserver.rules.yaml
@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'whizard-apiserver-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
+Generated from 'whizard-telemetry-apiserver.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}
@@ -8,7 +8,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
-  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-apiserver-recording.rules" | trunc 63 | trimSuffix "-" }}
+  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-apiserver.rules" | trunc 63 | trimSuffix "-" }}
   namespace: {{ template "kube-prometheus-stack.namespace" . }}
   labels:
     app: {{ template "kube-prometheus-stack.name" . }}
@@ -22,7 +22,7 @@ metadata:
 {{- end }}
 spec:
   groups:
-  - name: whizard-apiserver-recording.rules
+  - name: whizard-telemetry-apiserver.rules
     rules:
     - expr: sum by(cluster) (irate(apiserver_request_total{job="apiserver"}[5m]))
       record: apiserver:apiserver_request_total:sum_irate
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-cluster-recording.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-cluster.rules.yaml
similarity index 94%
rename from charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-cluster-recording.rules.yaml
rename to charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-cluster.rules.yaml
index 52f3f2c5bd3c..9eded575ccd4 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-cluster-recording.rules.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-cluster.rules.yaml
@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'whizard-cluster-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
+Generated from 'whizard-telemetry-cluster.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}
@@ -8,7 +8,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
-  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-cluster-recording.rules" | trunc 63 | trimSuffix "-" }}
+  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-cluster.rules" | trunc 63 | trimSuffix "-" }}
   namespace: {{ template "kube-prometheus-stack.namespace" . }}
   labels:
     app: {{ template "kube-prometheus-stack.name" . }}
@@ -22,7 +22,7 @@ metadata:
 {{- end }}
 spec:
   groups:
-  - name: whizard-cluster-recording.rules
+  - name: whizard-telemetry-cluster.rules
     rules:
     - expr: |-
        max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml
new file mode 100644
index 000000000000..30c8368d2388
--- /dev/null
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-namespace.rules.yaml
@@ -0,0 +1,118 @@
+{{- /*
+Generated from 'whizard-telemetry-namespace.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
+Do not change in-place! In order to change this file first read following link:
+https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
+*/ -}}
+{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
+{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.whizardTelemetry }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-namespace.rules" | trunc 63 | trimSuffix "-" }}
+  namespace: {{ template "kube-prometheus-stack.namespace" . }}
+  labels:
+    app: {{ template "kube-prometheus-stack.name" . }}
+{{ include "kube-prometheus-stack.labels" . | indent 4 }}
+{{- if .Values.defaultRules.labels }}
+{{ toYaml .Values.defaultRules.labels | indent 4 }}
+{{- end }}
+{{- if .Values.defaultRules.annotations }}
+  annotations:
+{{ toYaml .Values.defaultRules.annotations | indent 4 }}
+{{- end }}
+spec:
+  groups:
+  - name: whizard-telemetry-namespace.rules
+    rules:
+    - expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_cpu_usage:sum
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+      labels:
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+    - expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_memory_usage:sum
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+      labels:
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+    - expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_memory_wo_cache_usage:sum
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+      labels:
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+    - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_net_bytes_received:sum_irate
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+      labels:
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+    - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_net_bytes_transmitted:sum_irate
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+      labels:
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+    - expr: label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "daemonset", "(.*)")
+      labels:
+        workload_type: daemonset
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+      record: namespace:workload_unavalibled_replicas:ratio
+    - expr: label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)")
+      labels:
+        workload_type: deployment
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+      record: namespace:workload_unavalibled_replicas:ratio
+    - expr: label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)")
+      labels:
+        workload_type: statefulset
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+      record: namespace:workload_unavalibled_replicas:ratio
+{{- end }}
\ No newline at end of file
diff --git a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-node-recording.rules.yaml b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-node.rules.yaml
similarity index 88%
rename from charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-node-recording.rules.yaml
rename to charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-node.rules.yaml
index add0e3892d1d..69795f203e08 100644
--- a/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-node-recording.rules.yaml
+++ b/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/whizard-telemetry-node.rules.yaml
@@ -1,5 +1,5 @@
 {{- /*
-Generated from 'whizard-node-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
+Generated from 'whizard-telemetry-node.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
 Do not change in-place! In order to change this file first read following link:
 https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
 */ -}}
@@ -8,7 +8,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
-  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-node-recording.rules" | trunc 63 | trimSuffix "-" }}
+  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-node.rules" | trunc 63 | trimSuffix "-" }}
   namespace: {{ template "kube-prometheus-stack.namespace" . }}
   labels:
     app: {{ template "kube-prometheus-stack.name" . }}
@@ -22,7 +22,7 @@ metadata:
 {{- end }}
 spec:
   groups:
-  - name: whizard-node-recording.rules
+  - name: whizard-telemetry-node.rules
     rules:
     - expr: node:node_memory_bytes_used_total:sum / node:node_memory_bytes_total:sum
       record: node:node_memory_utilisation:ratio
@@ -68,6 +68,17 @@ spec:
           {{- toYaml . | nindent 8 }}
         {{- end }}
       {{- end }}
+    - expr: node:node_filesystem_bytes_total:sum - node:node_filesystem_bytes_used_total:sum
+      record: node:node_filesystem_avaliable_bytes_total:sum
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+      labels:
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
     - expr: |-
         sum by (cluster, node, instance, host_ip, role) (
           max by (cluster, node, instance, host_ip, device) (
@@ -134,6 +145,17 @@ spec:
           {{- toYaml . | nindent 8 }}
         {{- end }}
       {{- end }}
+    - expr: count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""} unless on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0)unless on (pod, namespace, cluster)((kube_pod_status_ready{condition="true",job="kube-state-metrics"} > 0)and on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} > 0))unless on (pod, namespace, cluster)kube_pod_container_status_waiting_reason{job="kube-state-metrics",reason="ContainerCreating"} > 0)/count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""}unless on (pod, namespace, cluster)kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"}> 0)
+      record: node:pod_abnormal:ratio
+      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+      labels:
+        {{- with .Values.defaultRules.additionalRuleLabels }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
+          {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
     - expr: sum by (cluster,node)(node_load1 / on(cluster,node) node:node_num_cpu:sum) * on(node, cluster) group_left(host_ip, role) max by(node, host_ip, role, cluster) (workspace_workload_node:kube_pod_info:{node!="",host_ip!=""})
       record: node:node_load1_per_cpu:ratio
       {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
diff --git a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet
index 28412c7715f4..4627b2a408aa 100644
--- a/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet
+++ b/ks-prometheus/components/whizard-telemetry-mixin/rules/custom.libsonnet
@@ -14,7 +14,7 @@
   prometheusRules+:: {
     groups+: [
       {
-        name: 'whizard-telemetry-cluster-recording.rules',
+        name: 'whizard-telemetry-cluster.rules',
         rules: [
           {
             // pod attribute tuple tuple (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, node_role, host_ip) ==> 1
@@ -56,7 +56,7 @@
         ],
       },
       {
-        name: 'whizard-telemetry-node-recording.rules',
+        name: 'whizard-telemetry-node.rules',
         rules: [
           {
             record: 'node:node_memory_utilisation:ratio',
@@ -64,6 +64,12 @@
               node:node_memory_bytes_used_total:sum / node:node_memory_bytes_total:sum
             ||| % $._config,
           },
+          {
+            record: 'node:node_memory_bytes_available:sum',
+            expr: |||
+              node:node_memory_bytes_total:sum - node:node_memory_bytes_used_total:sum
+            ||| % $._config,
+          },
           {
             record: 'node:node_memory_bytes_used_total:sum',
             expr: |||
@@ -82,6 +88,12 @@
               node:node_filesystem_bytes_used_total:sum / node:node_filesystem_bytes_total:sum
             ||| % $._config,
           },
+          {
+            record: 'node:node_filesystem_avaliable_bytes_total:sum',
+            expr: |||
+              node:node_filesystem_bytes_total:sum - node:node_filesystem_bytes_used_total:sum
+            ||| % $._config,
+          },
           {
             record: 'node:node_filesystem_bytes_used_total:sum',
             expr: |||
@@ -121,6 +133,12 @@
               sum by (cluster,node,host_ip,role)(kube_node_status_allocatable{resource="pods"} * on (cluster, node) (kube_node_status_condition{condition="Ready",status="true"}) * on(node, cluster) group_left(host_ip, role) max by(node, host_ip, role, cluster) (workspace_workload_node:kube_pod_info:{node!="",host_ip!=""}))
             ||| % $._config,
           },
+          {
+            record: 'node:pod_abnormal:ratio',
+            expr: |||
+              count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""} unless on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0)unless on (pod, namespace, cluster)((kube_pod_status_ready{condition="true",job="kube-state-metrics"} > 0)and on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} > 0))unless on (pod, namespace, cluster)kube_pod_container_status_waiting_reason{job="kube-state-metrics",reason="ContainerCreating"} > 0)/count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""}unless on (pod, namespace, cluster)kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"}> 0)
+            ||| % $._config,
+          },
           {
             record: 'node:node_load1_per_cpu:ratio',
             expr: |||
@@ -184,7 +202,69 @@
         ],
       },
       {
-        name: 'whizard-telemetry-apiserver-recording.rules',
+        name: 'whizard-telemetry-namespace.rules',
+        rules: [
+          {
+            record: 'namespace:workload_cpu_usage:sum',
+            expr: |||
+              sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+            ||| % $._config,
+          },
+          {
+            record: 'namespace:workload_memory_usage:sum',
+            expr: |||
+              sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+            ||| % $._config,
+          },
+          {
+            record: 'namespace:workload_memory_wo_cache_usage:sum',
+            expr: |||
+              sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+            ||| % $._config,
+          },
+          {
+            record: 'namespace:workload_net_bytes_received:sum_irate',
+            expr: |||
+              sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+            ||| % $._config,
+          },
+          {
+            record: 'namespace:workload_net_bytes_transmitted:sum_irate',
+            expr: |||
+              sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+            ||| % $._config,
+          },
+          {
+            record: 'namespace:workload_unavalibled_replicas:ratio',
+            expr: |||
+              label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, %(clusterLabel)s) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,%(clusterLabel)s), "workload", "$1", "daemonset", "(.*)")
+            ||| % $._config,
+            labels: {
+              workload_type: 'daemonset',
+            },
+          },
+          {
+            record: 'namespace:workload_unavalibled_replicas:ratio',
+            expr: |||
+              label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, %(clusterLabel)s) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, %(clusterLabel)s), "workload", "$1", "deployment", "(.*)")
+            ||| % $._config,
+            labels: {
+              workload_type: 'deployment',
+            },
+          },
+          {
+            record: 'namespace:workload_unavalibled_replicas:ratio',
+            expr: |||
+              label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, %(clusterLabel)s) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, %(clusterLabel)s), "workload", "$1", "statefulset", "(.*)")
+            ||| % $._config,
+            labels: {
+              workload_type: 'statefulset',
+            },
+          },
+        ]
+      },
+      {
+        name: 'whizard-telemetry-apiserver.rules',
         rules: [
           {
             record: 'apiserver:apiserver_request_total:sum_irate',
diff --git a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
index 03f66d480459..6e00d9252dbe 100644
--- a/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
+++ b/ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
@@ -10,7 +10,7 @@ metadata:
   namespace: kubesphere-monitoring-system
 spec:
   groups:
-  - name: whizard-telemetry-cluster-recording.rules
+  - name: whizard-telemetry-cluster.rules
     rules:
     - expr: |
         max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
@@ -43,7 +43,7 @@ spec:
           )
         )
       record: 'workspace_workload_node:kube_pod_info:'
-  - name: whizard-telemetry-node-recording.rules
+  - name: whizard-telemetry-node.rules
     rules:
     - expr: |
         node:node_memory_bytes_used_total:sum / node:node_memory_bytes_total:sum
@@ -57,6 +57,9 @@ spec:
     - expr: |
         node:node_filesystem_bytes_used_total:sum / node:node_filesystem_bytes_total:sum
       record: node:node_filesystem_utilisation:ratio
+    - expr: |
+        node:node_filesystem_bytes_total:sum - node:node_filesystem_bytes_used_total:sum
+      record: node:node_filesystem_avaliable_bytes_total:sum
     - expr: |
         sum by (cluster, node, instance, host_ip, role) (
           max by (cluster, node, instance, host_ip, device) (
@@ -81,6 +84,9 @@ spec:
     - expr: |
         sum by (cluster,node,host_ip,role)(kube_node_status_allocatable{resource="pods"} * on (cluster, node) (kube_node_status_condition{condition="Ready",status="true"}) * on(node, cluster) group_left(host_ip, role) max by(node, host_ip, role, cluster) (workspace_workload_node:kube_pod_info:{node!="",host_ip!=""}))
       record: node:node_pod_quota:sum
+    - expr: |
+        count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""} unless on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0)unless on (pod, namespace, cluster)((kube_pod_status_ready{condition="true",job="kube-state-metrics"} > 0)and on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} > 0))unless on (pod, namespace, cluster)kube_pod_container_status_waiting_reason{job="kube-state-metrics",reason="ContainerCreating"} > 0)/count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""}unless on (pod, namespace, cluster)kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"}> 0)
+      record: node:pod_abnormal:ratio
     - expr: |
         sum by (cluster,node)(node_load1 / on(cluster,node) node:node_num_cpu:sum) * on(node, cluster) group_left(host_ip, role) max by(node, host_ip, role, cluster) (workspace_workload_node:kube_pod_info:{node!="",host_ip!=""})
       record: node:node_load1_per_cpu:ratio
@@ -111,7 +117,39 @@ spec:
     - expr: |
         sum by (node, cluster)(node_filesystem_files{job="node-exporter", device=~"/dev/.*",device!~"/dev/loop\\d+"} - node_filesystem_files_free{job="node-exporter", device=~"/dev/.*",device!~"/dev/loop\\d+"}) * on(node, cluster) group_left(host_ip, role) max by(node, host_ip, role, cluster) (workspace_workload_node:kube_pod_info:{node!="",host_ip!=""})
       record: node:node_inodes_used_total:sum
-  - name: whizard-telemetry-apiserver-recording.rules
+  - name: whizard-telemetry-namespace.rules
+    rules:
+    - expr: |
+        sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_cpu_usage:sum
+    - expr: |
+        sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_memory_usage:sum
+    - expr: |
+        sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_memory_wo_cache_usage:sum
+    - expr: |
+        sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_net_bytes_received:sum_irate
+    - expr: |
+        sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
+      record: namespace:workload_net_bytes_transmitted:sum_irate
+    - expr: |
+        label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "daemonset", "(.*)")
+      labels:
+        workload_type: daemonset
+      record: namespace:workload_unavalibled_replicas:ratio
+    - expr: |
+        label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)")
+      labels:
+        workload_type: deployment
+      record: namespace:workload_unavalibled_replicas:ratio
+    - expr: |
+        label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)")
+      labels:
+        workload_type: statefulset
+      record: namespace:workload_unavalibled_replicas:ratio
+  - name: whizard-telemetry-apiserver.rules
     rules:
     - expr: |
         sum by(cluster) (irate(apiserver_request_total{job="apiserver"}[5m]))
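For reference, all of the whizard-telemetry groups rendered by these templates hang off a single toggle (mapped in sync_prometheus_rules.py above), and the generated rules also honour the chart's shared label hooks. A minimal values sketch for enabling them and attaching extra labels follows; the key paths are taken from the conditions in the templates above, while the label names and values are only illustrative:

    defaultRules:
      create: true
      rules:
        whizardTelemetry: true
      additionalRuleLabels:
        team: observability            # illustrative label added to every generated rule
      additionalRuleGroupLabels:
        whizardTelemetry:
          source: whizard-telemetry    # illustrative label added only to the whizard-telemetry groups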
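Once the rules are loaded, the precomputed series can be queried directly instead of repeating the raw expressions. Two illustrative PromQL queries (not part of the chart; the namespace value is just an example):

    # top 5 workloads by CPU usage in one namespace, from whizard-telemetry-namespace.rules
    topk(5, namespace:workload_cpu_usage:sum{namespace="kubesphere-monitoring-system"})

    # nodes where more than 10% of their scheduled pods are abnormal, from whizard-telemetry-node.rules
    node:pod_abnormal:ratio > 0.1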