[kube-prometheus-stack] update recording rules and remove grafana dependency #35

Merged: 1 commit merged on Dec 26, 2023
Changes from all commits
6 changes: 1 addition & 5 deletions charts/kube-prometheus-stack/Chart.yaml
@@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 52.1.3
version: 52.1.4
appVersion: v0.68.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
@@ -50,10 +50,6 @@ dependencies:
version: "4.23.*"
repository: https://prometheus-community.github.io/helm-charts
condition: nodeExporter.enabled
- name: grafana
version: "6.60.*"
repository: https://grafana.github.io/helm-charts
condition: grafana.enabled
- name: prometheus-windows-exporter
repository: https://prometheus-community.github.io/helm-charts
version: "0.1.*"
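For reference, the snippet below reproduces the Grafana dependency entry that this PR deletes from Chart.yaml. With it gone, `grafana.enabled` no longer pulls in a bundled Grafana subchart; consumers who still want one would need to restore this block in a fork or install Grafana as a separate Helm release.

```yaml
# Dependency block removed by this PR (copied from the diff above, for reference only).
dependencies:
  - name: grafana
    version: "6.60.*"
    repository: https://grafana.github.io/helm-charts
    condition: grafana.enabled
```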
61 changes: 0 additions & 61 deletions charts/kube-prometheus-stack/extension_values.yaml
@@ -11,22 +11,6 @@ global:
# or
# - "image-pull-secret"

namespaceOverride: kubesphere-monitoring-system

alertmanager:
alertmanagerSpec:
image:
registry: quay.io
repository: prometheus/alertmanager
tag: ""
replicas: 1
resources:
limits:
cpu: 200m
memory: 200Mi
requests:
cpu: 20m
memory: 30Mi

prometheus:
prometheusSpec:
@@ -77,7 +61,6 @@ prometheusOperator:
memory: 50Mi

kube-state-metrics:
namespaceOverride: kubesphere-monitoring-system
image:
registry: docker.io
repository: kubesphere/kube-state-metrics
@@ -104,7 +87,6 @@ kube-state-metrics:


prometheus-node-exporter:
namespaceOverride: kubesphere-monitoring-system
image:
registry: quay.io
repository: prometheus/node-exporter
@@ -129,46 +111,3 @@ prometheus-node-exporter:
cpu: 20m
memory: 20Mi

grafana:
enabled: false
namespaceOverride: kubesphere-monitoring-system
# grafana does not support the global.imageRegistry
image:
repository: grafana/grafana
tag: ""

resources:
limits:
cpu: "1"
memory: 2Gi
requests:
cpu: 100m
memory: 100Mi
persistence:
enabled: true
#storageClassName: default
type: "pvc"
size: 50Mi

sidecar:
image:
repository: quay.io/kiwigrid/k8s-sidecar
tag: 1.24.6
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 50m
memory: 50Mi
initChownData:
image:
repository: busybox
tag: "1.31.1"
resources:
limits:
cpu: 40m
memory: 40Mi
requests:
cpu: 20m
memory: 20Mi
7 changes: 4 additions & 3 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
@@ -113,9 +113,10 @@ def new_representer(dumper, data):
'node-exporter.rules': ' .Values.defaultRules.rules.nodeExporterRecording',

# custom rules
'whizard-apiserver-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-cluster-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-node-recording.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-apiserver.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-cluster.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-namespace.rules': ' .Values.defaultRules.rules.whizardTelemetry',
'whizard-telemetry-node.rules': ' .Values.defaultRules.rules.whizardTelemetry',
}

alert_condition_map = {
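Every group added to this map is gated on the same value, `.Values.defaultRules.rules.whizardTelemetry`. Judging from the rendered templates further down in this diff, the sync script wraps each mapped group in a guard of roughly the following shape (a sketch based on that rendered output, not on the script internals):

```yaml
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.whizardTelemetry }}
# ... the whizard-telemetry-* rule group is rendered here ...
{{- end }}
```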
7 changes: 4 additions & 3 deletions charts/kube-prometheus-stack/templates/prometheus/_rules.tpl
@@ -14,7 +14,8 @@ rules:
- "node.rules"
- "kubelet.rules"
- "node-exporter.rules"
- "whizard-apiserver-recording.rules"
- "whizard-cluster-recording.rules"
- "whizard-node-recording.rules"
- "whizard-telemetry-apiserver.rules"
- "whizard-telemetry-cluster.rules"
- "whizard-telemetry-namespace.rules"
- "whizard-telemetry-node.rules"
{{- end }}
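Because all four renamed groups share the `whizardTelemetry` condition, they can be switched off together from values. A minimal values sketch, assuming the stock `defaultRules` layout of this chart:

```yaml
defaultRules:
  create: true
  rules:
    # Disables the whizard-telemetry-apiserver/cluster/namespace/node rule groups in one go.
    whizardTelemetry: false
```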
@@ -1,5 +1,5 @@
{{- /*
Generated from 'whizard-apiserver-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Generated from 'whizard-telemetry-apiserver.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@@ -8,7 +8,7 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-apiserver-recording.rules" | trunc 63 | trimSuffix "-" }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-apiserver.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
@@ -22,7 +22,7 @@ metadata:
{{- end }}
spec:
groups:
- name: whizard-apiserver-recording.rules
- name: whizard-telemetry-apiserver.rules
rules:
- expr: sum by(cluster) (irate(apiserver_request_total{job="apiserver"}[5m]))
record: apiserver:apiserver_request_total:sum_irate
@@ -1,5 +1,5 @@
{{- /*
Generated from 'whizard-cluster-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Generated from 'whizard-telemetry-cluster.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@@ -8,7 +8,7 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-cluster-recording.rules" | trunc 63 | trimSuffix "-" }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-cluster.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
@@ -22,7 +22,7 @@ metadata:
{{- end }}
spec:
groups:
- name: whizard-cluster-recording.rules
- name: whizard-telemetry-cluster.rules
rules:
- expr: |-
max by (cluster, node, workspace, namespace, pod, qos_class, workload, workload_type, role, host_ip) (
@@ -0,0 +1,118 @@
{{- /*
Generated from 'whizard-telemetry-namespace.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.whizardTelemetry }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-namespace.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
groups:
- name: whizard-telemetry-namespace.rules
rules:
- expr: sum by(cluster,namespace,workload,workload_type)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_cpu_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_memory_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster, namespace,workload,workload_type) (node_namespace_pod_container:container_memory_working_set_bytes{} * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_memory_wo_cache_usage:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_receive_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_net_bytes_received:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster, namespace,workload,workload_type) (irate(container_network_transmit_bytes_total{namespace!="", pod!=""}[5m]) * on(cluster,pod)group_left(workload,workload_type) workspace_workload_node:kube_pod_info:{})
record: namespace:workload_net_bytes_transmitted:sum_irate
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: label_replace(sum(kube_daemonset_status_number_unavailable{job="kube-state-metrics"}) by (daemonset, namespace, cluster) / sum(kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}) by (daemonset, namespace,cluster), "workload", "$1", "deamonset", "(.*)")
labels:
workload_type: deamonset
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: namespace:workload_unavalibled_replicas:ratio
- expr: label_replace(sum(kube_deployment_status_replicas_unavailable{job="kube-state-metrics"}) by (deployment, namespace, cluster) / sum(kube_deployment_spec_replicas{job="kube-state-metrics"}) by (deployment, namespace, cluster), "workload", "$1", "deployment", "(.*)")
labels:
workload_type: deployment
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: namespace:workload_unavalibled_replicas:ratio
- expr: label_replace(1 - sum(kube_statefulset_status_replicas_ready{job="kube-state-metrics"}) by (statefulset, namespace, cluster) / sum(kube_statefulset_status_replicas{job="kube-state-metrics"}) by (statefulset, namespace, cluster), "workload", "$1", "statefulset", "(.*)")
labels:
workload_type: statefulset
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
record: namespace:workload_unavalibled_replicas:ratio
{{- end }}
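Each rule in this new group also honors the chart's additional-label hooks. A values sketch showing how extra labels would be attached to the whizard-telemetry recording rules (the label names below are examples only):

```yaml
defaultRules:
  additionalRuleLabels:
    # Example label; applied to every default rule.
    team: observability
  additionalRuleGroupLabels:
    whizardTelemetry:
      # Example label; applied only to rules in the whizardTelemetry groups.
      source: whizard
```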
@@ -1,5 +1,5 @@
{{- /*
Generated from 'whizard-node-recording.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Generated from 'whizard-telemetry-node.rules' group from file://../../../ks-prometheus/manifests/whizard-telemetry-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
@@ -8,7 +8,7 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-node-recording.rules" | trunc 63 | trimSuffix "-" }}
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "whizard-telemetry-node.rules" | trunc 63 | trimSuffix "-" }}
namespace: {{ template "kube-prometheus-stack.namespace" . }}
labels:
app: {{ template "kube-prometheus-stack.name" . }}
@@ -22,7 +22,7 @@ metadata:
{{- end }}
spec:
groups:
- name: whizard-node-recording.rules
- name: whizard-telemetry-node.rules
rules:
- expr: node:node_memory_bytes_used_total:sum / node:node_memory_bytes_total:sum
record: node:node_memory_utilisation:ratio
@@ -68,6 +68,17 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: node:node_filesystem_bytes_total:sum - node:node_filesystem_bytes_used_total:sum
record: node:node_filesystem_avaliable_bytes_total:sum
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: |-
sum by (cluster, node, instance, host_ip, role) (
max by (cluster, node, instance, host_ip, device) (
@@ -134,6 +145,17 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""} unless on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"} > 0)unless on (pod, namespace, cluster)((kube_pod_status_ready{condition="true",job="kube-state-metrics"} > 0)and on (pod, namespace, cluster)(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} > 0))unless on (pod, namespace, cluster)kube_pod_container_status_waiting_reason{job="kube-state-metrics",reason="ContainerCreating"} > 0)/count by (node, host_ip, role, cluster) (node_namespace_pod:kube_pod_info:{node!=""}unless on (pod, namespace, cluster)kube_pod_status_phase{job="kube-state-metrics",phase="Succeeded"}> 0)
record: node:pod_abnormal:ratio
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
labels:
{{- with .Values.defaultRules.additionalRuleLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
- expr: sum by (cluster,node)(node_load1 / on(cluster,node) node:node_num_cpu:sum) * on(node, cluster) group_left(host_ip, role) max by(node, host_ip, role, cluster) (workspace_workload_node:kube_pod_info:{node!="",host_ip!=""})
record: node:node_load1_per_cpu:ratio
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.whizardTelemetry }}