From 538ec92c24052c41e1b8b7e5d89e27b7496df53a Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Fri, 30 Jun 2023 11:41:36 -0700 Subject: [PATCH 1/5] Removing duplicate alerts from ci recommended alerts --- .../templates/ci_recommended_alerts.json | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json b/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json index 85a5263e0..0c0f8a615 100644 --- a/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json +++ b/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json @@ -46,44 +46,6 @@ "enabled": true, "interval": "PT5M", "rules": [ - { - "alert": "Average CPU usage per container is greater than 95%", - "expression": "sum (rate(container_cpu_usage_seconds_total{image!=\"\", container!=\"POD\"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!=\"\", container!=\"POD\"}/container_spec_cpu_period{image!=\"\", container!=\"POD\"}) by (pod,cluster,container,namespace) > .95", - "for": "PT5M", - "annotations": { - "description": "Average CPU usage per container is greater than 95%" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT15M" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "Average Memory usage per container is greater than 95%.", - "expression": "(container_memory_working_set_bytes{container!=\"\", image!=\"\", container!=\"POD\"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource=\"memory\", node!=\"\"}) > .95 ", - "for": "PT10M", - "annotations": { - "description": "Average Memory usage per container is greater than 95%" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, { "alert": "Number of OOM killed containers is greater than 0", "expression": "sum by (cluster,container,namespace)(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}) > 0", From 428070693aa5a58e504563047b39011eb3753b2e Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Tue, 18 Jul 2023 10:34:20 -0700 Subject: [PATCH 2/5] Remove test branch --- .pipelines/azure-pipeline-build.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pipelines/azure-pipeline-build.yml b/.pipelines/azure-pipeline-build.yml index 674b36be2..13354c152 100644 --- a/.pipelines/azure-pipeline-build.yml +++ b/.pipelines/azure-pipeline-build.yml @@ -2,7 +2,6 @@ trigger: branches: include: - main - - incpubsizesoham pr: autoCancel: true branches: From 12ad6c4f46cbb67644c885c70dae40c129521d7b Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Thu, 10 Aug 2023 09:45:50 -0700 Subject: [PATCH 3/5] Remove preview keyword from policy readme --- AddonPolicyTemplate/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AddonPolicyTemplate/README.md b/AddonPolicyTemplate/README.md index a68cbd34c..84392e8f5 100644 --- a/AddonPolicyTemplate/README.md +++ b/AddonPolicyTemplate/README.md @@ -1,6 +1,6 @@ You can create the policy definition using a command like : -```az policy definition create --name "(Preview) Prometheus Metrics addon" --display-name "(Preview) Prometheus Metrics addon" --mode Indexed --metadata version=1.0.0 category=Kubernetes --rules .\AddonPolicyMetricsProfile.rules.json --params .\AddonPolicyMetricsProfile.parameters.json``` +```az policy definition create --name "Prometheus Metrics addon" --display-name "Prometheus Metrics addon" --mode Indexed --metadata version=1.0.0 category=Kubernetes --rules .\AddonPolicyMetricsProfile.rules.json --params .\AddonPolicyMetricsProfile.parameters.json``` **NOTE** From 41ccd8e5d2caaec1752048c84f9bc7c474a1d7f4 Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Tue, 19 Sep 2023 18:43:01 -0700 Subject: [PATCH 4/5] Add cluster id scope to rule groups for alerts to be linked to UX --- .../FullAzureMonitorMetricsProfile.json | 12 ++++++++---- .../WindowsRecordingRules.json | 8 +++++--- .../AzureMonitorAlertsProfile.bicep | 8 ++------ .../FullAzureMonitorMetricsProfile.bicep | 16 ++++------------ .../AddonPolicyMetricsProfile.rules.json | 16 ++++++++++++---- .../Default/DefaultAlerts.json | 9 ++++++++- .../Default/DefaultRecordingRules.json | 11 ++++++++++- .../templates/ci_recommended_alerts.json | 9 ++++++++- 8 files changed, 57 insertions(+), 32 deletions(-) diff --git a/AddonArmTemplate/FullAzureMonitorMetricsProfile.json b/AddonArmTemplate/FullAzureMonitorMetricsProfile.json index 7fb9c5f89..f6432a023 100644 --- a/AddonArmTemplate/FullAzureMonitorMetricsProfile.json +++ b/AddonArmTemplate/FullAzureMonitorMetricsProfile.json @@ -221,7 +221,8 @@ "properties": { "description": "[concat(variables('nodeRecordingRuleGroupDescription'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "clusterName": "[variables('clusterName')]", "interval": "PT1M", @@ -281,7 +282,8 @@ "properties": { "description": "[concat(variables('kubernetesRecordingRuleGroupDescription'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "clusterName": "[variables('clusterName')]", "interval": "PT1M", @@ -385,7 +387,8 @@ "properties": { "description": "[concat(variables('RecordingRuleGroupDescriptionWin'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "enabled": "[parameters('enableWindowsRecordingRules')]", "clusterName": "[variables('clusterName')]", @@ -462,7 +465,8 @@ "properties": { "description": "[concat(variables('RecordingRuleGroupDescriptionWin'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "enabled": "[parameters('enableWindowsRecordingRules')]", "clusterName": "[variables('clusterName')]", diff --git a/AddonArmTemplate/WindowsRecordingRuleGroupTemplate/WindowsRecordingRules.json b/AddonArmTemplate/WindowsRecordingRuleGroupTemplate/WindowsRecordingRules.json index 68d58498a..69d9e6d49 100644 --- a/AddonArmTemplate/WindowsRecordingRuleGroupTemplate/WindowsRecordingRules.json +++ b/AddonArmTemplate/WindowsRecordingRuleGroupTemplate/WindowsRecordingRules.json @@ -56,7 +56,8 @@ "properties": { "description": "[concat(variables('RecordingRuleGroupDescriptionWin'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "enabled": true, "clusterName": "[variables('clusterName')]", @@ -133,7 +134,8 @@ "properties": { "description": "[concat(variables('RecordingRuleGroupDescriptionWin'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "enabled": true, "clusterName": "[variables('clusterName')]", @@ -211,4 +213,4 @@ } } ] -} \ No newline at end of file +} diff --git a/AddonBicepTemplate/AzureMonitorAlertsProfile.bicep b/AddonBicepTemplate/AzureMonitorAlertsProfile.bicep index 7949150c2..6ed9a1b83 100644 --- a/AddonBicepTemplate/AzureMonitorAlertsProfile.bicep +++ b/AddonBicepTemplate/AzureMonitorAlertsProfile.bicep @@ -16,9 +16,7 @@ resource recommendedAlerts 'Microsoft.AlertsManagement/prometheusRuleGroups@2023 location: location properties: { description: 'Kubernetes Alert RuleGroup-RecommendedCIAlerts - 0.1' - scopes: [ - monitorWorkspace.id - ] + scopes: [monitorWorkspace.id,aksResourceId] clusterName: split(aksResourceId, '/')[8] enabled: true interval: 'PT5M' @@ -241,9 +239,7 @@ resource communityALerts 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-0 location: location properties: { description: 'Kubernetes Alert RuleGroup-communityCIAlerts - 0.1' - scopes: [ - monitorWorkspace.id - ] + scopes: [monitorWorkspace.id,aksResourceId] clusterName: split(aksResourceId, '/')[8] enabled: true interval: 'PT1M' diff --git a/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep b/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep index c906b2e87..58715bc12 100644 --- a/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep +++ b/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep @@ -141,9 +141,7 @@ resource nodeRecordingRuleGroup 'Microsoft.AlertsManagement/prometheusRuleGroups location: azureMonitorWorkspaceLocation properties: { description: '${nodeRecordingRuleGroupDescription}${version}' - scopes: [ - azureMonitorWorkspaceResourceId - ] + scopes: [azureMonitorWorkspaceResourceId,clusterResourceId] enabled: true clusterName: clusterName interval: 'PT1M' @@ -201,9 +199,7 @@ resource kubernetesRecordingRuleGroup 'Microsoft.AlertsManagement/prometheusRule location: azureMonitorWorkspaceLocation properties: { description: '${kubernetesRecordingRuleGroupDescription}${version}' - scopes: [ - azureMonitorWorkspaceResourceId - ] + scopes: [azureMonitorWorkspaceResourceId,clusterResourceId] enabled: true clusterName: clusterName interval: 'PT1M' @@ -305,9 +301,7 @@ resource nodeRecordingRuleGroupNameWin 'Microsoft.AlertsManagement/prometheusRul location: azureMonitorWorkspaceLocation properties: { description: '${RecordingRuleGroupDescriptionWin}${version}' - scopes: [ - azureMonitorWorkspaceResourceId - ] + scopes: [azureMonitorWorkspaceResourceId,clusterResourceId] enabled: enableWindowsRecordingRules clusterName: clusterName interval: 'PT1M' @@ -381,9 +375,7 @@ resource nodeAndKubernetesRecordingRuleGroupNameWin 'Microsoft.AlertsManagement/ location: azureMonitorWorkspaceLocation properties: { description: '${RecordingRuleGroupDescriptionWin}${version}' - scopes: [ - azureMonitorWorkspaceResourceId - ] + scopes: [azureMonitorWorkspaceResourceId,clusterResourceId] enabled: enableWindowsRecordingRules clusterName: clusterName interval: 'PT1M' diff --git a/AddonPolicyTemplate/AddonPolicyMetricsProfile.rules.json b/AddonPolicyTemplate/AddonPolicyMetricsProfile.rules.json index 8137837a0..297d10301 100644 --- a/AddonPolicyTemplate/AddonPolicyMetricsProfile.rules.json +++ b/AddonPolicyTemplate/AddonPolicyMetricsProfile.rules.json @@ -225,7 +225,10 @@ "location": "[parameters('azureMonitorWorkspaceLocation')]", "properties": { "description": "[concat(variables('nodeRecordingRuleGroupDescription'), variables('version'))]", - "scopes": ["[parameters('azureMonitorWorkspaceResourceId')]"], + "scopes": [ + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" + ], "clusterName": "[variables('clusterName')]", "interval": "PT1M", "rules": [ @@ -283,7 +286,10 @@ "location": "[parameters('azureMonitorWorkspaceLocation')]", "properties": { "description": "[concat(variables('kubernetesRecordingRuleGroupDescription'), variables('version'))]", - "scopes": ["[parameters('azureMonitorWorkspaceResourceId')]"], + "scopes": [ + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" + ], "clusterName": "[variables('clusterName')]", "interval": "PT1M", "rules": [ @@ -386,7 +392,8 @@ "properties": { "description": "[concat(variables('RecordingRuleGroupDescriptionWin'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "enabled": "[parameters('enableWindowsRecordingRules')]", "clusterName": "[variables('clusterName')]", @@ -463,7 +470,8 @@ "properties": { "description": "[concat(variables('RecordingRuleGroupDescriptionWin'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "enabled": "[parameters('enableWindowsRecordingRules')]", "clusterName": "[variables('clusterName')]", diff --git a/GeneratedMonitoringArtifacts/Default/DefaultAlerts.json b/GeneratedMonitoringArtifacts/Default/DefaultAlerts.json index f28eaeb50..d931a73f0 100644 --- a/GeneratedMonitoringArtifacts/Default/DefaultAlerts.json +++ b/GeneratedMonitoringArtifacts/Default/DefaultAlerts.json @@ -8,6 +8,12 @@ "description": "Cluster name" } }, + "clusterResourceId": { + "type": "string", + "metadata": { + "description": "Cluster Resource Id" + } + }, "actionGroupResourceId": { "type": "string", "metadata": { @@ -40,7 +46,8 @@ "properties": { "description": "[concat(variables('kubernetesAlertRuleGroupDescription'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "clusterName": "[parameters('clusterName')]", "interval": "PT1M", diff --git a/GeneratedMonitoringArtifacts/Default/DefaultRecordingRules.json b/GeneratedMonitoringArtifacts/Default/DefaultRecordingRules.json index 4eb17e815..75054d763 100644 --- a/GeneratedMonitoringArtifacts/Default/DefaultRecordingRules.json +++ b/GeneratedMonitoringArtifacts/Default/DefaultRecordingRules.json @@ -8,6 +8,12 @@ "description": "Cluster name" } }, + "clusterResourceId": { + "type": "string", + "metadata": { + "description": "Cluster Resource Id" + } + }, "azureMonitorWorkspaceResourceId": { "type": "string", "metadata": { @@ -41,7 +47,10 @@ "location": "[parameters('location')]", "properties": { "description": "[concat(variables('nodeRecordingRuleGroupDescription'), variables('version'))]", - "scopes": [ "[parameters('azureMonitorWorkspaceResourceId')]" ], + "scopes": [ + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" + ], "enabled": true, "clusterName": "[parameters('clusterName')]", "interval": "PT1M", diff --git a/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json b/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json index 13cdb3027..6143de054 100644 --- a/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json +++ b/mixins/kubernetes/rules/recording_and_alerting_rules/templates/ci_recommended_alerts.json @@ -8,6 +8,12 @@ "description": "Cluster name" } }, + "clusterResourceId": { + "type": "string", + "metadata": { + "description": "Cluster Resource Id" + } + }, "actionGroupResourceId": { "type": "string", "metadata": { @@ -40,7 +46,8 @@ "properties": { "description": "[concat(variables('kubernetesAlertRuleGroupDescription'), variables('version'))]", "scopes": [ - "[parameters('azureMonitorWorkspaceResourceId')]" + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" ], "clusterName": "[parameters('clusterName')]", "enabled": true, From 71a921211be0397019f82dbe7afa0d2f395968ba Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Tue, 26 Sep 2023 22:41:21 -0700 Subject: [PATCH 5/5] adding terraform update --- AddonTerraformTemplate/main.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/AddonTerraformTemplate/main.tf b/AddonTerraformTemplate/main.tf index 1d386e2d8..b3593d983 100644 --- a/AddonTerraformTemplate/main.tf +++ b/AddonTerraformTemplate/main.tf @@ -119,7 +119,7 @@ resource "azurerm_monitor_alert_prometheus_rule_group" "node_recording_rules_rul description = "Node Recording Rules Rule Group" rule_group_enabled = true interval = "PT1M" - scopes = [azurerm_monitor_workspace.amw.id] + scopes = [azurerm_monitor_workspace.amw.id,azurerm_kubernetes_cluster.k8s.id] rule { enabled = true @@ -209,7 +209,7 @@ resource "azurerm_monitor_alert_prometheus_rule_group" "kubernetes_recording_rul description = "Kubernetes Recording Rules Rule Group" rule_group_enabled = true interval = "PT1M" - scopes = [azurerm_monitor_workspace.amw.id] + scopes = [azurerm_monitor_workspace.amw.id,azurerm_kubernetes_cluster.k8s.id] rule { enabled = true @@ -366,7 +366,7 @@ resource "azurerm_monitor_alert_prometheus_rule_group" "node_and_kubernetes_reco description = "Node and Kubernetes Recording Rules Rule Group for Windows Nodes" rule_group_enabled = true interval = "PT1M" - scopes = [azurerm_monitor_workspace.amw.id] + scopes = [azurerm_monitor_workspace.amw.id,azurerm_kubernetes_cluster.k8s.id] rule { enabled = true @@ -497,7 +497,7 @@ resource "azurerm_monitor_alert_prometheus_rule_group" "node_recording_rules_rul description = "Node and Kubernetes Recording Rules Rule Group for Windows Nodes" rule_group_enabled = true interval = "PT1M" - scopes = [azurerm_monitor_workspace.amw.id] + scopes = [azurerm_monitor_workspace.amw.id,azurerm_kubernetes_cluster.k8s.id] rule { enabled = true