From 9d2a2a63b2b4adfb639deca4680fb499d1eef6bc Mon Sep 17 00:00:00 2001 From: Sohamdg081992 <31517098+Sohamdg081992@users.noreply.github.com> Date: Mon, 16 Oct 2023 09:31:49 -0700 Subject: [PATCH] Updating the prod monitoring alerts and update ref app node selector to schedule on specific node pool (#633) * Removing duplicate alerts from ci recommended alerts * Remove test branch * Remove preview keyword from policy readme * Updating the prod monitoring alerts and update ref app node selector to schedule on specific node pool * . --- internal/alerts/example-alert-template.json | 58 +++++++++---------- .../prometheus-reference-app.yaml | 1 + 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/internal/alerts/example-alert-template.json b/internal/alerts/example-alert-template.json index ce865c59d..e9a7a5a45 100644 --- a/internal/alerts/example-alert-template.json +++ b/internal/alerts/example-alert-template.json @@ -21,7 +21,7 @@ { "alert": "Amd64 metric missing in cluster ci-dev-aks-mac-eus", "expression": "absent(node_uname_info{machine=\"x86_64\"}) == 1 or node_uname_info{machine=\"x86_64\"} == 0", - "for": "PT3M", + "for": "PT30M", "annotations": { "description": "Amd64 metric missing in cluster ci-dev-aks-mac-eus" }, @@ -38,8 +38,8 @@ }, { "alert": "up metric missing for target = node in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"node\"}) == 1 or up{job=\"node\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"node\"}[30m]) == 1 or count(up{job=\"node\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = node in cluster ci-dev-aks-mac-eus" }, @@ -56,8 +56,8 @@ }, { "alert": "up metric missing for target = kubelet in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"kubelet\"}) == 1 or up{job=\"kubelet\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"kubelet\"}[30m]) == 1 or count(up{job=\"kubelet\"} == 1) == 0", + "for": "PT30M", 
"annotations": { "description": "up metric is not flowing for target = kubelet in cluster ci-dev-aks-mac-eus" }, @@ -74,8 +74,8 @@ }, { "alert": "up metric missing for target = windows-exporter in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"windows-exporter\"}) == 1 or up{job=\"windows-exporter\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"windows-exporter\"}[30m]) == 1 or count(up{job=\"windows-exporter\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = windows-exporter in cluster ci-dev-aks-mac-eus" }, @@ -92,8 +92,8 @@ }, { "alert": "up metric missing for target = kube-proxy in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"kube-proxy\"}) == 1 or up{job=\"kube-proxy\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"kube-proxy\"}[30m]) == 1 or count(up{job=\"kube-proxy\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = kube-proxy in cluster ci-dev-aks-mac-eus" }, @@ -110,8 +110,8 @@ }, { "alert": "up metric missing for target = kube-apiserver in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"kube-apiserver\"}) == 1 or up{job=\"kube-apiserver\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"kube-apiserver\"}[30m]) == 1 or count(up{job=\"kube-apiserver\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = kube-apiserver in cluster ci-dev-aks-mac-eus" }, @@ -128,8 +128,8 @@ }, { "alert": "up metric missing for target = kube-proxy-windows in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"kube-proxy-windows\"}) == 1 or up{job=\"kube-proxy-windows\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"kube-proxy-windows\"}[30m]) == 1 or count(up{job=\"kube-proxy-windows\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = 
kube-proxy-windows in cluster ci-dev-aks-mac-eus" }, @@ -146,8 +146,8 @@ }, { "alert": "up metric missing for target = kube-state-metrics in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"kube-state-metrics\"}) == 1 or up{job=\"kube-state-metrics\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"kube-state-metrics\"}[30m]) == 1 or count(up{job=\"kube-state-metrics\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = kube-state-metrics in cluster ci-dev-aks-mac-eus" }, @@ -164,8 +164,8 @@ }, { "alert": "up metric missing for target = cadvisor in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"cadvisor\"}) == 1 or up{job=\"cadvisor\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"cadvisor\"}[30m]) == 1 or count(up{job=\"cadvisor\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = cadvisor in cluster ci-dev-aks-mac-eus" }, @@ -182,8 +182,8 @@ }, { "alert": "up metric missing for target = kube-dns in cluster ci-dev-aks-mac-eus", - "expression": "absent(up{job=\"kube-dns\"}) == 1 or up{job=\"kube-dns\"} == 0", - "for": "PT3M", + "expression": "absent_over_time(up{job=\"kube-dns\"}[30m]) == 1 or count(up{job=\"kube-dns\"} == 1) == 0", + "for": "PT30M", "annotations": { "description": "up metric is not flowing for target = kube-dns in cluster ci-dev-aks-mac-eus" }, @@ -199,11 +199,11 @@ ] }, { - "alert": "CPU usage % greater than 90 for prometheus-collector containers on cluster ci-dev-aks-mac-eus", - "expression": "sum(sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\", namespace=\"kube-system\", container=\"prometheus-collector\"}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\", namespace=\"kube-system\"}) )) by (container, 
pod) > 0.9", + "alert": "CPU usage % greater than 75 for prometheus-collector containers on cluster ci-dev-aks-mac-eus", + "expression": "sum(sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\", namespace=\"kube-system\", container=\"prometheus-collector\"}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\", namespace=\"kube-system\"}) )) by (container, pod) *100 > 75", "for": "PT3M", "annotations": { - "description": "CPU usage greater than 90% for prometheus-collector on cluster ci-dev-aks-mac-eus" + "description": "CPU usage greater than 75% for prometheus-collector on cluster ci-dev-aks-mac-eus" }, "severity": 4, "resolveConfiguration": { @@ -217,11 +217,11 @@ ] }, { - "alert": "CPU usage % greater than 50 for prometheus-collector containers on cluster ci-dev-aks-mac-eus", - "expression": "sum(sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job=\"cadvisor\", image!=\"\", namespace=\"kube-system\", container=\"prometheus-collector\"}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\", namespace=\"kube-system\"}) )) by (container, pod) > 0.5", + "alert": "Memory usage % greater than 75 for prometheus-collector containers on cluster ci-dev-aks-mac-eus", + "expression": "(sum(container_memory_working_set_bytes{namespace=\"kube-system\", container=\"prometheus-collector\", image!=\"\"}) by (container, pod) / sum(kube_pod_container_resource_limits{namespace=\"kube-system\", container=\"prometheus-collector\", resource=\"memory\"}) by (container, pod)) * 100 > 75", "for": "PT3M", "annotations": { - "description": "CPU usage greater than 5% for prometheus-collector on cluster ci-dev-aks-mac-eus" + "description": "Memory usage greater than 75% for prometheus-collector
containers on cluster ci-dev-aks-mac-eus" }, "severity": 4, "resolveConfiguration": { @@ -235,11 +235,11 @@ ] }, { - "alert": "Memory usage is high for prometheus-collector containers on cluster ci-dev-aks-mac-eus", - "expression": "(sum(container_memory_working_set_bytes{namespace=\"kube-system\", container=\"prometheus-collector\", image!=\"\"}) by (container, pod) / sum(kube_pod_container_resource_requests{namespace=\"kube-system\", container=\"prometheus-collector\", resource=\"memory\"}) by (container, pod)) > 1.9", - "for": "PT3M", + "alert": "Custom job metric missing for target = prometheus_ref_app in cluster ci-dev-aks-mac-eus", + "expression": "absent_over_time(myapp_rainfall_histogram_sum[30m]) == 1 or count(myapp_rainfall_histogram_sum == 1) == 0", + "for": "PT30M", "annotations": { - "description": "Memory usage is high for prometheus-collector containers on cluster ci-dev-aks-mac-eus" + "description": "Custom job metric missing for target = prometheus_ref_app in cluster ci-dev-aks-mac-eus" }, "severity": 4, "resolveConfiguration": { diff --git a/internal/referenceapp/prometheus-reference-app.yaml b/internal/referenceapp/prometheus-reference-app.yaml index 759cfa546..816a27085 100644 --- a/internal/referenceapp/prometheus-reference-app.yaml +++ b/internal/referenceapp/prometheus-reference-app.yaml @@ -34,6 +34,7 @@ spec: protocol: TCP nodeSelector: kubernetes.io/os: linux + architecture: amd64 --- apiVersion: v1 kind: Service