From 538ec92c24052c41e1b8b7e5d89e27b7496df53a Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Fri, 30 Jun 2023 11:41:36 -0700 Subject: [PATCH 1/5] Removing duplicate alerts from ci recommended alerts --- .../templates/ci_recommended_alerts.json | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json b/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json index 85a5263e0..0c0f8a615 100644 --- a/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json +++ b/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json @@ -46,44 +46,6 @@ "enabled": true, "interval": "PT5M", "rules": [ - { - "alert": "Average CPU usage per container is greater than 95%", - "expression": "sum (rate(container_cpu_usage_seconds_total{image!=\"\", container!=\"POD\"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!=\"\", container!=\"POD\"}/container_spec_cpu_period{image!=\"\", container!=\"POD\"}) by (pod,cluster,container,namespace) > .95", - "for": "PT5M", - "annotations": { - "description": "Average CPU usage per container is greater than 95%" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT15M" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "Average Memory usage per container is greater than 95%.", - "expression": "(container_memory_working_set_bytes{container!=\"\", image!=\"\", container!=\"POD\"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource=\"memory\", node!=\"\"}) > .95 ", - "for": "PT10M", - "annotations": { - "description": "Average Memory usage per container is greater than 95%" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, { "alert": "Number of OOM killed containers is greater than 0", "expression": "sum by (cluster,container,namespace)(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}) > 0", From 428070693aa5a58e504563047b39011eb3753b2e Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Tue, 18 Jul 2023 10:34:20 -0700 Subject: [PATCH 2/5] Remove test branch --- .pipelines/azure-pipeline-build.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pipelines/azure-pipeline-build.yml b/.pipelines/azure-pipeline-build.yml index 674b36be2..13354c152 100644 --- a/.pipelines/azure-pipeline-build.yml +++ b/.pipelines/azure-pipeline-build.yml @@ -2,7 +2,6 @@ trigger: branches: include: - main - - incpubsizesoham pr: autoCancel: true branches: From 12ad6c4f46cbb67644c885c70dae40c129521d7b Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Thu, 10 Aug 2023 09:45:50 -0700 Subject: [PATCH 3/5] Remove preview keyword from policy readme --- AddonPolicyTemplate/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AddonPolicyTemplate/README.md b/AddonPolicyTemplate/README.md index a68cbd34c..84392e8f5 100644 --- a/AddonPolicyTemplate/README.md +++ b/AddonPolicyTemplate/README.md @@ -1,6 +1,6 @@ You can create the policy definition using a command like : -```az policy definition create --name "(Preview) Prometheus Metrics addon" --display-name "(Preview) Prometheus Metrics addon" --mode Indexed --metadata version=1.0.0 category=Kubernetes --rules .\AddonPolicyMetricsProfile.rules.json --params .\AddonPolicyMetricsProfile.parameters.json``` +```az policy definition create --name "Prometheus Metrics addon" --display-name "Prometheus Metrics addon" --mode Indexed --metadata version=1.0.0 category=Kubernetes --rules .\AddonPolicyMetricsProfile.rules.json --params .\AddonPolicyMetricsProfile.parameters.json``` **NOTE** From 82344f2ac4d3f72fad1bb28e71c97e87266941db Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Mon, 6 Nov 2023 16:52:26 -0800 Subject: [PATCH 4/5] update for clause in agent version alert --- internal/alerts/example-alert-template.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/alerts/example-alert-template.json b/internal/alerts/example-alert-template.json index 46884a2a0..31141e87b 100644 --- a/internal/alerts/example-alert-template.json +++ b/internal/alerts/example-alert-template.json @@ -255,13 +255,13 @@ { "alert": "New agent version found for prometheus collector", "expression": "count(count (kube_pod_container_info{image=~\"mcr.microsoft.com/azuremonitor/containerinsights/ciprod/prometheus-collector.*\"}) by (image)) > 2", - "for": "PT30M", + "for": "PT60S", "annotations": { "description": "New agent version found for prometheus collector. This alert is only used in near ring regions for prod monitoring clusters" }, "severity": 4, "resolveConfiguration": { - "autoResolved": true, + "autoResolved": false, "timeToResolve": "PT10M" }, "actions": [ From a6a6ca20c529af73c5e57b837cac8b38e2513250 Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Wed, 29 Nov 2023 15:29:58 -0800 Subject: [PATCH 5/5] Update config with new uids --- mixins/coredns/config.libsonnet | 2 +- mixins/kubernetes/config.libsonnet | 44 +++++++++++++++--------------- mixins/node/config.libsonnet | 8 +++--- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/mixins/coredns/config.libsonnet b/mixins/coredns/config.libsonnet index 8380dd190..88a28f13d 100644 --- a/mixins/coredns/config.libsonnet +++ b/mixins/coredns/config.libsonnet @@ -4,7 +4,7 @@ instanceLabel: 'pod', grafanaDashboardIDs: { - 'coredns.json': 'ddcc77bf776f4f5f97660c85e1e96738', + 'coredns.json': 'ddcc78cf776f4f5f97660c85e1e96738', }, pluginNameLabel: 'name', diff --git a/mixins/kubernetes/config.libsonnet b/mixins/kubernetes/config.libsonnet index d852026bb..f3801ad75 100644 --- a/mixins/kubernetes/config.libsonnet +++ b/mixins/kubernetes/config.libsonnet @@ -36,42 +36,42 @@ // Grafana dashboard IDs are necessary for stable links for dashboards grafanaDashboardIDs: { //non-default - 'k8s-resources-multicluster.json': 'e6fd5c5a88514d61af69c0d97ee76738', - 'k8s-resources-cluster.json': 'efa86fd1d0c121a26444b636a3f56738', - 'k8s-resources-namespace.json': '85a562078cdf77779eaa1add43cc6738', - 'k8s-resources-pod.json': '6581e46e4e5c7ba40a07646395ef6738', + 'k8s-resources-multicluster.json': 'ec943b19c8e54dc587ce68d7438e6738', + 'k8s-resources-cluster.json': 'fd0cac08a3f34e2994cf904627836738', + 'k8s-resources-namespace.json': '6385dfe4b7f54710aa1f748b34ba6738', + 'k8s-resources-pod.json': 'ac3253a2c4a149d68ccd0a58c7ab6738', //not-used 'k8s-multicluster-rsrc-use.json': 'NJ9AlnsObVgj9uKiJMeAqfzMi1wihOMupcsDhlhR', //not-used 'k8s-cluster-rsrc-use.json': 'uXQldxzqUNgIOUX6FyZNvqgP2vgYb78daNu4GiDc', //not-used 'k8s-node-rsrc-use.json': 'E577CMUOwmPsxVVqM9lj40czM1ZPjclw7hGa7OT7', // ? Confirm if I need to add the new dashboards here //not-used 'nodes.json': 'kcb9C2QDe4IYcjiTOmYyfhsImuzxRcvwWC3YLJPS', //non-default - 'persistentvolumesusage.json': '497766c2c6ea4851b6b4397cb8a96738', + 'persistentvolumesusage.json': 'e618c3c758db4ff093eb7c8059896738', //not-used 'pods.json': 'AMK9hS0rSbSz7cKjPHcOtk6CGHFjhSHwhbQ3sedK', //not-used 'statefulset.json': 'dPiBt0FRG5BNYo0XJ4L0Meoc7DWs9eL40c1CRc1g', 'k8s-resources-windows-cluster.json': '6438557df4391b100730f2494bcc6738', - 'k8s-resources-windows-namespace.json': '98e54027a2724ab1d4c45666c1fa6738', - 'k8s-resources-windows-pod.json': '56497a7ea5610e936dc6ed374a7c6738', - 'k8s-windows-cluster-rsrc-use.json': 'VESDB6738', - 'k8s-windows-node-rsrc-use.json': 'YCBDf6738', - 'k8s-resources-workloads-namespace.json': 'a87fb0d919ec0ea5f6543124e16c6738', - 'k8s-resources-workload.json': 'a164a7f0339f99e89cea5cb47e9b6738', - 'apiserver.json': 'efe630eb6d9d4888ac542cad7a666738', - 'controller-manager.json': '3aa700ed75ce4c64ba52ef5ca23f6738', - 'scheduler.json': '0252eb9a5da7445a8787400871546738', - 'proxy.json': '6cc85d728d7245aeaa630a3486206738', - 'kubelet.json': '3138fa155d5915769fbded898ac06738', + 'k8s-resources-windows-namespace.json': '9f84792794e34121bd0fa99075d96738', + 'k8s-resources-windows-pod.json': '78070a924a2f4fe4ad515a90f19c6738', + 'k8s-windows-cluster-rsrc-use.json': 'VPLDB6738', + 'k8s-windows-node-rsrc-use.json': 'YDBDf6738', + 'k8s-resources-workloads-namespace.json': '2745ce2b859a40f7990ff6b85d736738', + 'k8s-resources-workload.json': '3151475894614845ba54456099696738', + 'apiserver.json': 'e76da360d12a41968447be5de9756738', + 'controller-manager.json': '3bb700ed75ce4c64ba53ef5ca23f6738', + 'scheduler.json': '3334208071584af2b85f29f4fe6b6738', + 'proxy.json': 'c167a6c9d4154ead836af5c702096738', + 'kubelet.json': '184244a28b3d478e9c0de82def316738', //newly added non-default - 'workload-total.json': 'a11d4aaa68bb4018b6a83623ca046738', + 'workload-total.json': '1067870b91dc476e93e973c66d666738', //non-default - 'pod-total.json': 'e7f918d9d1aa4d37a3933c0f9a816738', + 'pod-total.json': '388ce1735ac7484fbc24a173ce926738', //non-default - 'namespace-by-workload.json': '2043d0bedbc24793aecdc5ed7dc16738', + 'namespace-by-workload.json': '06f22b5c9e224d43a9af3cdba8e96738', //non-default - 'namespace-by-pod.json': '6fd059f91b894b499a13fada53606738', - 'k8s-resources-node.json': '200ac8fdbfbb74b39aff88118e4d6738', + 'namespace-by-pod.json': 'e89424f17baa4bed90b572f081eb6738', + 'k8s-resources-node.json': '7857fbef7cd44823a509c7dfbd166738', //non-default - 'cluster-total.json': '2fdf7ea1f7c04f028a220e7835066738', + 'cluster-total.json': 'bd1240a5c4d24bb09da405d0af1c6738', }, diff --git a/mixins/node/config.libsonnet b/mixins/node/config.libsonnet index 5b2456b06..5d480d299 100644 --- a/mixins/node/config.libsonnet +++ b/mixins/node/config.libsonnet @@ -63,11 +63,11 @@ // Grafana dashboard IDs are necessary for stable links for dashboards grafanaDashboardIDs: { - 'nodes.json': 'D4pVs6738', - 'node-rsrc-use.json': 't5aja6738', + 'nodes.json': 'D3pVs6738', + 'node-rsrc-use.json': 't6aja6738', //non-default - 'node-cluster-rsrc-use.json': 'Vdeba6738', - 'node-multicluster-rsrc-use.json': 'abcjOM6738', + 'node-cluster-rsrc-use.json': 'Vdera6738', + 'node-multicluster-rsrc-use.json': 'abdjOM6738', }, }, }