Skip to content

Commit

Permalink
[kube-prometheus-stack] update recording rule and remove grafana depe…
Browse files Browse the repository at this point in the history
…ndency

Signed-off-by: frezes <[email protected]>
  • Loading branch information
frezes committed Dec 26, 2023
1 parent 56008f3 commit 526e2bd
Show file tree
Hide file tree
Showing 10 changed files with 282 additions and 87 deletions.
6 changes: 1 addition & 5 deletions charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 52.1.2
version: 52.1.3
appVersion: v0.68.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down Expand Up @@ -50,10 +50,6 @@ dependencies:
version: "4.23.*"
repository: https://prometheus-community.github.io/helm-charts
condition: nodeExporter.enabled
- name: grafana
version: "6.60.*"
repository: https://grafana.github.io/helm-charts
condition: grafana.enabled
- name: prometheus-windows-exporter
repository: https://prometheus-community.github.io/helm-charts
version: "0.1.*"
Expand Down
61 changes: 0 additions & 61 deletions charts/kube-prometheus-stack/extension_values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,6 @@ global:
# or
# - "image-pull-secret"

namespaceOverride: kubesphere-monitoring-system

alertmanager:
alertmanagerSpec:
image:
registry: quay.io
repository: prometheus/alertmanager
tag: ""
replicas: 1
resources:
limits:
cpu: 200m
memory: 200Mi
requests:
cpu: 20m
memory: 30Mi

prometheus:
prometheusSpec:
Expand Down Expand Up @@ -77,7 +61,6 @@ prometheusOperator:
memory: 50Mi

kube-state-metrics:
namespaceOverride: kubesphere-monitoring-system
image:
registry: docker.io
repository: kubesphere/kube-state-metrics
Expand All @@ -104,7 +87,6 @@ kube-state-metrics:


prometheus-node-exporter:
namespaceOverride: kubesphere-monitoring-system
image:
registry: quay.io
repository: prometheus/node-exporter
Expand All @@ -129,46 +111,3 @@ prometheus-node-exporter:
cpu: 20m
memory: 20Mi

grafana:
enabled: false
namespaceOverride: kubesphere-monitoring-system
# grafana does not support the global.imageRegistry
image:
repository: grafana/grafana
tag: ""

resources:
limits:
cpu: "1"
memory: 2Gi
requests:
cpu: 100m
memory: 100Mi
persistence:
enabled: true
#storageClassName: default
type: "pvc"
size: 50Mi

sidecar:
image:
repository: quay.io/kiwigrid/k8s-sidecar
tag: 1.24.6
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 50m
memory: 50Mi
initChownData:
image:
repository: busybox
tag: "1.31.1"
resources:
limits:
cpu: 40m
memory: 40Mi
requests:
cpu: 20m
memory: 20Mi
7 changes: 4 additions & 3 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,10 @@ def new_representer(dumper, data):
'node-exporter.rules': ' .Values.defaultRules.rules.nodeExporterRecording',

# custom rules
'whizard-apiserver-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-cluster-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-node-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-apiserver.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-cluster.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-namespace.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-node.rules': ' .Values.defaultRules.rules.whizardTelemetry',
}

alert_condition_map = {
Expand Down
7 changes: 4 additions & 3 deletions charts/kube-prometheus-stack/templates/prometheus/_rules.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ rules:
- "node.rules"
- "kubelet.rules"
- "node-exporter.rules"
- "whizard-apiserver-recording.rules"
- "whizard-cluster-recording.rules"
- "whizard-node-recording.rules"
- "whizard-telemetry-apiserver.rules"
- "whizard-telemetry-cluster.rules"
- "whizard-telemetry-namespace.rules"
- "whizard-telemetry-node.rules"
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{{- /*
Generated from 'whizard-apiserver-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Generated from 'whizard-telemetry-apiserver.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
Expand All @@ -8,7 +8,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-apiserver-recording.rules" | trunc 63 | trimSuffix "-" }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-apiserver.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
Expand All @@ -22,7 +22,7 @@ metadata:
{{- end }}
spec:
groups:
- name: whizard-apiserver-recording.rules
- name: whizard-telemetry-apiserver.rules
rules:
- expr: sum by(cluster) (irate(apiserver_request_total{job="apiserver"}[5m]))
record: apiserver:apiserver_request_total:sum_irate
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{{- /*
Generated from 'whizard-cluster-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Generated from 'whizard-telemetry-cluster.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
Expand All @@ -8,7 +8,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-cluster-recording.rules" | trunc 63 | trimSuffix "-" }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-cluster.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
Expand All @@ -22,7 +22,7 @@ metadata:
{{- end }}
spec:
groups:
- name: whizard-cluster-recording.rules
- name: whizard-telemetry-cluster.rules
rules:
- expr: |-
max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{{- /*
Generated from 'whizard-telemetry-namespace.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.whizardTelemetry }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-namespace.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: whizard-telemetry-namespace.rules
    rules:
    - expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
      record: namespace:workload_cpu_usage:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    {{- /* NOTE(review): this expr is byte-identical to namespace:workload_memory_wo_cache_usage:sum
         below; presumably one of the two should be based on container_memory_usage_bytes
         (usage including cache) — confirm against the ks-prometheus upstream manifest. */}}
    - expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
      record: namespace:workload_memory_usage:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
      record: namespace:workload_memory_wo_cache_usage:sum
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
      record: namespace:workload_net_bytes_received:sum_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    - expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
      record: namespace:workload_net_bytes_transmitted:sum_irate
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
      labels:
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
    {{- /* Fixed: label_replace's source label and the workload_type value were misspelled
         "deamonset". kube_daemonset_* metrics expose a `daemonset` label, so the misspelled
         source label matched nothing and `(.*)` matched the empty string, yielding an empty
         `workload` label; the siblings below correctly use "deployment"/"statefulset". */}}
    - expr: label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "daemonset", "(.*)")
      labels:
        workload_type: daemonset
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      {{- /* NOTE(review): record name misspells "unavailable"; kept as-is because dashboards
           and alerts query it by this exact name — coordinate a rename upstream. */}}
      record: namespace:workload_unavalibled_replicas:ratio
    - expr: label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)")
      labels:
        workload_type: deployment
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: namespace:workload_unavalibled_replicas:ratio
    - expr: label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)")
      labels:
        workload_type: statefulset
      {{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
        {{- with .Values.defaultRules.additionalRuleLabels }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
          {{- toYaml . | nindent 8 }}
        {{- end }}
      {{- end }}
      record: namespace:workload_unavalibled_replicas:ratio
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{{- /*
Generated from 'whizard-node-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Generated from 'whizard-telemetry-node.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
Expand All @@ -8,7 +8,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-node-recording.rules" | trunc 63 | trimSuffix "-" }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-node.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
Expand All @@ -22,7 +22,7 @@ metadata:
{{- end }}
spec:
groups:
- name: whizard-node-recording.rules
- name: whizard-telemetry-node.rules
rules:
- expr: node:node_memory_bytes_used_total:sum / node:node_memory_bytes_total:sum
record: node:node_memory_utilisation:ratio
Expand Down Expand Up @@ -68,6 +68,17 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: node:node_filesystem_bytes_total:sum - node:node_filesystem_bytes_used_total:sum
record: node:node_filesystem_avaliable_bytes_total:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
sum by (cluster, node, instance, host_ip, role) (
max by (cluster, node, instance, host_ip, device) (
Expand Down Expand Up @@ -134,6 +145,17 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""} unless on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0)unless on (pod, namespace, cluster)((kube_pod_status_ready{condition="true",job="kube-state-metrics"} > 0)and on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} > 0))unless on (pod, namespace, cluster)kube_pod_container_status_waiting_reason{job="kube-state-metrics",reason="ContainerCreating"} > 0)/count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""}unless on (pod, namespace, cluster)kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"}> 0)
record: node:pod_abnormal:ratio
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster,node)(node_load1 / on(cluster,node) node:node_num_cpu:sum) * on(node, cluster) group_left(host_ip, role) max by(node, host_ip, role, cluster) (workspace_workload_node:kube_pod_info:{node!="",host_ip!=""})
record: node:node_load1_per_cpu:ratio
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
Expand Down
Loading

0 comments on commit 526e2bd

Please sign in to comment.