From fafaa3b43a818c37d717322b7b3c065f07905b9b Mon Sep 17 00:00:00 2001
From: Sohamdg081992 <31517098+Sohamdg081992@users.noreply.github.com>
Date: Wed, 6 Mar 2024 15:23:17 -0800
Subject: [PATCH] Update templates with latest refinements (#763)

This change does the following:
1. Make the templates consistent with the current state of the prod [manifest](https://msazure.visualstudio.com/One/_git/AzureMonitorResourceProviderOnboarding?path=/src/resourceproviders/microsoft.containerservice/managedclusters/prod/manifest.json), removing some alerts according to the [PRD](https://microsoft.sharepoint.com/:w:/r/teams/azurecontainercompute/_layouts/15/Doc.aspx?sourcedoc=%7BD7733B7A-2BD1-4F09-95C7-9A114B4A8055%7D&file=Recommended%20Alerts%20PRD.docx&wdLOR=cF191ECF1-3BEC-4012-BC1F-C5CF1CBD4365&fromShare=true&action=default&mobileredirect=true).
2. Update description links to aka.ms.
3. Change long, sentence-style alert names to single-word names.
4. Update the Bicep template to match the ARM template.

I will create a Tip PR in the manifest for points 2 and 3 above.

--- .trivyignore | 3 +- .../recommendedMetricAlerts.bicep | 503 +++++------------- .../Default/recommendedMetricAlerts.json | 272 ++-------- 3 files changed, 183 insertions(+), 595 deletions(-) diff --git a/.trivyignore b/.trivyignore index 018f886f4..658415de0 100644 --- a/.trivyignore +++ b/.trivyignore @@ -10,7 +10,7 @@ GHSA-fr2g-9hjm-wr23 # HIGH - promconfigvalidator # HIGH - telegraf -CVE-2023-39325 +CVE-2023-39325 GHSA-m425-mq94-257g CVE-2023-47090 CVE-2023-46129 @@ -25,6 +25,7 @@ CVE-2023-48795 GHSA-jq35-85cj-fj4p GHSA-7ww5-4wqc-m92c GHSA-mhpq-9638-x6pw +CVE-2023-50658 CVE-2023-48795 CVE-2023-3978 CVE-2023-44487 diff --git a/AddonBicepTemplate/recommendedMetricAlerts.bicep b/AddonBicepTemplate/recommendedMetricAlerts.bicep index 886cadd72..f474a3bba 100644 --- a/AddonBicepTemplate/recommendedMetricAlerts.bicep +++ b/AddonBicepTemplate/recommendedMetricAlerts.bicep @@ -10,143 +10,32 @@ resource monitorWorkspace 'Microsoft.Monitor/accounts@2023-04-03' = { properties: {} } - -resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { - name: 'RecommendedMetricAlerts-${split(aksResourceId, '/')[8]}' +resource recommendedMetricAlertsClusterLevel 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: 'RecommendedMetricAlerts-Cluster-level-${split(aksResourceId, '/')[8]}' location: location properties: { - description: 'Kubernetes Alert RuleGroup-RecommendedMetricAlerts - 0.1' + description: 'Kubernetes Alert RuleGroup-RecommendedMetricAlerts-Cluster-level - 0.1' scopes: [monitorWorkspace.id,aksResourceId] clusterName: split(aksResourceId, '/')[8] enabled: true - interval: 'PT5M' + interval: 'PT1M' rules: [ { - alert: 'KubePodCrashLooping' - expression: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1' - for: 'PT15M' - annotations: { - description: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf "%.2f" $value }} / second. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodCrashLooping.md).'
- } - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'Job did not complete in time' - expression: 'sum by(namespace,cluster)(kube_job_spec_completions{job="kube-state-metrics"}) - sum by(namespace,cluster)(kube_job_status_succeeded{job="kube-state-metrics"}) > 0 ' - for: 'PT360M' - annotations: { - description: 'Number of stale jobs older than six hours is greater than 0' - } - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT15M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'Pod container restarted in the last 1 hour' - expression: 'sum by (namespace, controller, container, cluster)(increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1h])* on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' - for: 'PT15M' - annotations: { - description: 'Pod container restarted in the last 1 hour' - } - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'Ready state of pods is less than 80%. ' - expression: 'sum by (cluster,namespace,deployment)(kube_deployment_status_replicas_ready) / sum by (cluster,namespace,deployment)(kube_deployment_spec_replicas) <.8 or sum by (cluster,namespace,deployment)(kube_daemonset_status_number_ready) / sum by (cluster,namespace,deployment)(kube_daemonset_status_desired_number_scheduled) <.8 ' - for: 'PT5M' - annotations: { - description: 'Ready state of pods is less than 80%.' - } - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT15M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'Number of pods in failed state are greater than 0.' - expression: 'sum by (cluster, namespace, controller) (kube_pod_status_phase{phase="failed"} * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + alert: 'KubeCPUQuotaOvercommit' + expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) /sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 1.5' for: 'PT5M' annotations: { - description: 'Number of pods in failed state are greater than 0' - } - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT15M' + description: 'Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' 
} labels: { severity: 'warning' } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'KubePodNotReadyByController' - expression: 'sum by (namespace, controller, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} ) * on(namespace, pod, cluster) group_left(controller)label_replace(kube_pod_owner,"controller","$1","owner_name","(.*)")) > 0' - for: 'PT15M' - annotations: { - description: '{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} by controller is not ready. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodNotReady.md).' - } enabled: true severity: 3 resolveConfiguration: { autoResolved: true timeToResolve: 'PT10M' } - labels: { - severity: 'warning' - } actions: [ { actionGroupId: actionGroupResourceId @@ -154,11 +43,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeStatefulSetGenerationMismatch' - expression: 'kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"}' - for: 'PT15M' + alert: 'KubeMemoryQuotaOvercommit' + expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) /sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 1.5' + for: 'PT5M' annotations: { - description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetGenerationMismatch.md).' + description: 'Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true severity: 3 @@ -176,14 +65,14 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeJobNotCompleted' - expression: 'time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"} and kube_job_status_active{job="kube-state-metrics"} > 0) > 43200' - for: 'PT15M' + alert: 'KubeContainerOOMKilledCount' + expression: 'sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + for: 'PT5M' annotations: { - description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} is taking more than 12 hours to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobCompletion.md).' + description: 'Number of OOM killed containers is greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' 
} enabled: true - severity: 3 + severity: 4 resolveConfiguration: { autoResolved: true timeToResolve: 'PT10M' @@ -198,11 +87,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeJobFailed' - expression: 'kube_job_failed{job="kube-state-metrics"} > 0' + alert: 'KubeClientErrors' + expression: '(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01' for: 'PT15M' annotations: { - description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobFailed.md).' + description: 'Kubernetes API server client \'{{ $labels.job }}/{{ $labels.instance }}\' is experiencing {{ $value | humanizePercentage }} errors. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true severity: 3 @@ -220,11 +109,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'Average CPU usage per container is greater than 95%' - expression: 'sum (rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!="", container!="POD"}/container_spec_cpu_period{image!="", container!="POD"}) by (pod,cluster,container,namespace) > .95' - for: 'PT5M' + alert: 'KubePersistentVolumeFillingUp' + expression: 'kubelet_volume_stats_available_bytes{job="kubelet"}/kubelet_volume_stats_capacity_bytes{job="kubelet"} < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1' + for: 'PT60M' annotations: { - description: 'Average CPU usage per container is greater than 95%' + description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true severity: 4 @@ -242,11 +131,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'Average Memory usage per container is greater than 95%.' 
- expression: 'avg by (namespace, controller, container, cluster)(((container_memory_working_set_bytes{container!="", image!="", container!="POD"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource="memory", node!=""})*on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > .95)' - for: 'PT10M' + alert: 'KubePersistentVolumeInodesFillingUp' + expression: 'kubelet_volume_stats_inodes_free{job="kubelet"} / kubelet_volume_stats_inodes{job="kubelet"} < 0.03' + for: 'PT15M' annotations: { - description: 'Average Memory usage per container is greater than 95%' + description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true severity: 4 @@ -264,11 +153,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeletPodStartUpLatencyHigh' - expression: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"} > 60' - for: 'PT10M' + alert: 'KubePersistentVolumeErrors' + expression: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0' + for: 'PT05M' annotations: { - description: 'Kubelet Pod startup latency is too high. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeletPodStartUpLatencyHigh.md)' + description: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true severity: 4 @@ -286,11 +175,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'Average PV usage is greater than 80%' - expression: 'avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job="kubelet"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job="kubelet"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)"))) > .8' - for: 'PT15M' + alert: 'KubeContainerWaiting' + expression: 'sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0' + for: 'PT60M' annotations: { - description: 'Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%' + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' 
} enabled: true severity: 3 @@ -308,17 +197,17 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeDeploymentReplicasMismatch' - expression: '( kube_deployment_spec_replicas{job="kube-state-metrics"} > kube_deployment_status_replicas_available{job="kube-state-metrics"}) and ( changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' + alert: 'KubeDaemonSetNotScheduled' + expression: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0' for: 'PT15M' annotations: { - description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md)' + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true - severity: 4 + severity: 3 resolveConfiguration: { autoResolved: true - timeToResolve: 'PT15M' + timeToResolve: 'PT10M' } labels: { severity: 'warning' @@ -330,14 +219,14 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeStatefulSetReplicasMismatch' - expression: '( kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"}) and ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' + alert: 'KubeDaemonSetMisScheduled' + expression: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0' for: 'PT15M' annotations: { - description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md)' + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true - severity: 4 + severity: 3 resolveConfiguration: { autoResolved: true timeToResolve: 'PT10M' @@ -352,17 +241,17 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeHpaReplicasMismatch' - expression: '(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} !=kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} >kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} 0) > 0.9 < 1' for: 'PT15M' annotations: { - description: 'Horizontal Pod Autoscaler in {{ $labels.cluster}} has not matched the desired number of replicas for longer than 15 minutes. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeHpaReplicasMismatch.md)' + description: '{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts).' } enabled: true - severity: 4 + severity: 3 resolveConfiguration: { autoResolved: true - timeToResolve: 'PT15M' + timeToResolve: 'PT10M' } labels: { severity: 'warning' @@ -373,56 +262,26 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup } ] } + ] + } +} + +resource recommendedMetricAlertsNodeLevel 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: 'RecommendedMetricAlerts-Node-level-${split(aksResourceId, '/')[8]}' + location: location + properties: { + description: 'Kubernetes Alert RuleGroup-RecommendedMetricAlerts-Node-level - 0.1' + scopes: [monitorWorkspace.id,aksResourceId] + clusterName: split(aksResourceId, '/')[8] + enabled: true + interval: 'PT1M' + rules: [ { - alert: 'KubeHpaMaxedOut' - expression: 'kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} ==kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}' + alert: 'KubeNodeUnreachable' + expression: '(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1' for: 'PT15M' annotations: { - description: 'Horizontal Pod Autoscaler in {{ $labels.cluster}} has been running at max replicas for longer than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeHpaMaxedOut.md)' - } - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT15M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'KubeCPUQuotaOvercommit' - expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) /sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 1.5' - for: 'PT5M' - annotations: { - description: 'Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md)' - } - labels: { - severity: 'warning' - } - enabled: true - severity: 3 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'KubeMemoryQuotaOvercommit' - expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) /sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 1.5' - for: 'PT5M' - annotations: { - description: 'Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md)' + description: '{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/node-level-recommended-alerts).' } enabled: true severity: 3 @@ -440,11 +299,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeVersionMismatch' - expression: 'count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1' + alert: 'KubeNodeReadinessFlapping' + expression: 'sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2' for: 'PT15M' annotations: { - description: 'There are {{ $value }} different versions of Kubernetes components running in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeVersionMismatch.md)' + description: 'The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/node-level-recommended-alerts).' } enabled: true severity: 3 @@ -461,15 +320,29 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup } ] } + ] + } +} + +resource recommendedMetricAlertsPodLevel 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: 'RecommendedMetricAlerts-Pod-level-${split(aksResourceId, '/')[8]}' + location: location + properties: { + description: 'Kubernetes Alert RuleGroup-RecommendedMetricAlerts-Pod-level - 0.1' + scopes: [monitorWorkspace.id,aksResourceId] + clusterName: split(aksResourceId, '/')[8] + enabled: true + interval: 'PT1M' + rules: [ { - alert: 'KubeClientErrors' - expression: '(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01' + alert: 'KubePodCrashLooping' + expression: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1' for: 'PT15M' annotations: { - description: 'Kubernetes API server client \'{{ $labels.job }}/{{ $labels.instance }}\' is experiencing {{ $value | humanizePercentage }} errors. Please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeClientErrors.md)' + description: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf "%.2f" $value }} / second. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
} enabled: true - severity: 3 + severity: 4 resolveConfiguration: { autoResolved: true timeToResolve: 'PT10M' @@ -484,11 +357,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubePersistentVolumeFillingUp' - expression: 'kubelet_volume_stats_available_bytes{job="kubelet"}/kubelet_volume_stats_capacity_bytes{job="kubelet"} < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1' - for: 'PT60M' + alert: 'KubeJobStale' + expression: 'sum by(namespace,cluster)(kube_job_spec_completions{job="kube-state-metrics"}) - sum by(namespace,cluster)(kube_job_status_succeeded{job="kube-state-metrics"}) > 0 ' + for: 'PT360M' annotations: { - description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeFillingUp.md)' + description: 'Number of stale jobs older than six hours is greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true severity: 4 @@ -506,11 +379,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubePersistentVolumeInodesFillingUp' - expression: 'kubelet_volume_stats_inodes_free{job="kubelet"} / kubelet_volume_stats_inodes{job="kubelet"} < 0.03' + alert: 'KubePodContainerRestart' + expression: 'sum by (namespace, controller, container, cluster)(increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1h])* on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' for: 'PT15M' annotations: { - description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes.' + description: 'Pod container restarted in the last 1 hour. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true severity: 4 @@ -528,55 +401,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubePersistentVolumeErrors' - expression: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0' - for: 'PT05M' - annotations: { - description: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeErrors.md)' - } - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'Average node CPU utilization is greater than 80%' - expression: '( (1 - rate(node_cpu_seconds_total{job="node", mode="idle"}[5m]) ) / ignoring(cpu) group_left count without (cpu)( node_cpu_seconds_total{job="node", mode="idle"}) ) > .8 ' + alert: 'KubePodReadyStateLow' + expression: 'sum by (cluster,namespace,deployment)(kube_deployment_status_replicas_ready) / sum by (cluster,namespace,deployment)(kube_deployment_spec_replicas) <.8 or sum by (cluster,namespace,deployment)(kube_daemonset_status_number_ready) / sum by (cluster,namespace,deployment)(kube_daemonset_status_desired_number_scheduled) <.8 ' for: 'PT5M' annotations: { - description: 'Average node CPU utilization is greater than 80%' - } - enabled: true - severity: 3 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'Working set memory for a node is greater than 80%.' - expression: '1 - avg by (namespace, cluster, job, node)(label_replace(node_memory_MemAvailable_bytes{job="node"}, "node", "$1", "instance", "(.*)")) / avg by (namespace, cluster, job, node)(label_replace(node_memory_MemTotal_bytes{job="node"}, "node", "$1", "instance", "(.*)")) > .8' - for: 'PT05M' - annotations: { - description: 'Working set memory for a node is greater than 80%.' + description: 'Ready state of pods is less than 80%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true severity: 4 @@ -594,39 +423,17 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'Number of OOM killed containers is greater than 0' - expression: 'sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + alert: 'KubePodFailedState' + expression: 'sum by (cluster, namespace, controller) (kube_pod_status_phase{phase="failed"} * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' for: 'PT5M' annotations: { - description: 'Number of OOM killed containers is greater than 0' + description: 'Number of pods in failed state are greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
} enabled: true severity: 4 resolveConfiguration: { autoResolved: true - timeToResolve: 'PT10M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'KubeNodeUnreachable' - expression: '(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1' - for: 'PT15M' - annotations: { - description: '{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeUnreachable.md).' - } - enabled: true - severity: 3 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' + timeToResolve: 'PT15M' } labels: { severity: 'warning' @@ -638,11 +445,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeNodeNotReady' - expression: 'kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0' + alert: 'KubePodNotReadyByController' + expression: 'sum by (namespace, controller, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} ) * on(namespace, pod, cluster) group_left(controller)label_replace(kube_pod_owner,"controller","$1","owner_name","(.*)")) > 0' for: 'PT15M' annotations: { - description: '{{ $labels.node }} in {{ $labels.cluster}} has been unready for more than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeNotReady.md).' + description: '{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} by controller is not ready. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true severity: 3 @@ -660,11 +467,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeNodeReadinessFlapping' - expression: 'sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2' + alert: 'KubeStatefulSetGenerationMismatch' + expression: 'kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"}' for: 'PT15M' annotations: { - description: 'The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeReadinessFlapping.md).' + description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
} enabled: true severity: 3 @@ -682,33 +489,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeContainerWaiting' - expression: 'sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0' - for: 'PT60M' - annotations: { - description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.' - } - enabled: true - severity: 3 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' - } - labels: { - severity: 'warning' - } - actions: [ - { - actionGroupId: actionGroupResourceId - } - ] - } - { - alert: 'KubeDaemonSetNotScheduled' - expression: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0' + alert: 'KubeJobFailed' + expression: 'kube_job_failed{job="kube-state-metrics"} > 0' for: 'PT15M' annotations: { - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' + description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true severity: 3 @@ -726,17 +511,17 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeDaemonSetMisScheduled' - expression: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0' - for: 'PT15M' + alert: 'KubeContainerAverageCPUHigh' + expression: 'sum (rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!="", container!="POD"}/container_spec_cpu_period{image!="", container!="POD"}) by (pod,cluster,container,namespace) > .95' + for: 'PT5M' annotations: { - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' + description: 'Average CPU usage per container is greater than 95%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true - severity: 3 + severity: 4 resolveConfiguration: { autoResolved: true - timeToResolve: 'PT10M' + timeToResolve: 'PT15M' } labels: { severity: 'warning' @@ -748,17 +533,17 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeletClientCertificateExpiration' - expression: 'kubelet_certificate_manager_client_ttl_seconds < 7 * 24 * 3600' - for: 'PT5M' + alert: 'KubeContainerAverageMemoryHigh' + expression: 'avg by (namespace, controller, container, cluster)(((container_memory_working_set_bytes{container!="", image!="", container!="POD"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource="memory", node!=""})*on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > .95)' + for: 'PT10M' annotations: { - description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.' + description: 'Average Memory usage per container is greater than 95%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
} enabled: true severity: 4 resolveConfiguration: { autoResolved: true - timeToResolve: 'PT15M' + timeToResolve: 'PT10M' } labels: { severity: 'warning' @@ -770,11 +555,11 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeletServerCertificateExpiration' - expression: 'kubelet_certificate_manager_server_ttl_seconds < 7 * 24 * 3600' + alert: 'KubeletPodStartUpLatencyHigh' + expression: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"} > 60' for: 'PT10M' annotations: { - description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.' + description: 'Kubelet Pod startup latency is too high. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true severity: 4 @@ -792,14 +577,14 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeletClientCertificateRenewalErrors' - expression: 'increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0' + alert: 'KubePVUsageHigh' + expression: 'avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job="kubelet"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job="kubelet"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)"))) > .8' for: 'PT15M' annotations: { - description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).' + description: 'Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true - severity: 4 + severity: 3 resolveConfiguration: { autoResolved: true timeToResolve: 'PT10M' @@ -814,17 +599,17 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeletServerCertificateRenewalErrors' - expression: 'increase(kubelet_server_expiration_renew_errors[5m]) > 0' + alert: 'KubeDeploymentReplicasMismatch' + expression: '( kube_deployment_spec_replicas{job="kube-state-metrics"} > kube_deployment_status_replicas_available{job="kube-state-metrics"}) and ( changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' for: 'PT15M' annotations: { - description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).' + description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
} enabled: true severity: 4 resolveConfiguration: { autoResolved: true - timeToResolve: 'PT10M' + timeToResolve: 'PT15M' } labels: { severity: 'warning' @@ -836,14 +621,14 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeQuotaAlmostFull' - expression: 'kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type)(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 0.9 < 1' + alert: 'KubeStatefulSetReplicasMismatch' + expression: '( kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"}) and ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' for: 'PT15M' annotations: { - description: '{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeQuotaAlmostFull.md).' + description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' } enabled: true - severity: 3 + severity: 4 resolveConfiguration: { autoResolved: true timeToResolve: 'PT10M' @@ -858,17 +643,17 @@ resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroup ] } { - alert: 'KubeQuotaFullyUsed' - expression: 'kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) == 1' + alert: 'KubeHpaReplicasMismatch' + expression: '(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} !=kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} >kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} 0) > 1' + alert: 'KubeHpaMaxedOut' + expression: 'kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} ==kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}' for: 'PT15M' annotations: { - description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).' + description: 'Horizontal Pod Autoscaler in {{ $labels.cluster}} has been running at max replicas for longer than 15 minutes. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts).' 
} enabled: true severity: 4 resolveConfiguration: { autoResolved: true - timeToResolve: 'PT10M' + timeToResolve: 'PT15M' } labels: { severity: 'warning' diff --git a/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json b/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json index 8cb401fd9..0ef8d01ba 100644 --- a/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json +++ b/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json @@ -57,7 +57,7 @@ "expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"})) /sum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"}) > 1.5", "for": "PT5M", "annotations": { - "description": "Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md)" + "description": "Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)." }, "labels": { "severity": "warning" @@ -79,7 +79,7 @@ "expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})) /sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) > 1.5", "for": "PT5M", "annotations": { - "description": "Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md)" + "description": "Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)." }, "enabled": true, "severity": 3, @@ -97,33 +97,11 @@ ] }, { - "alert": "KubeVersionMismatch", - "expression": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1", - "for": "PT15M", - "annotations": { - "description": "There are {{ $value }} different versions of Kubernetes components running in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeVersionMismatch.md)" - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "Number of OOM killed containers is greater than 0", + "alert": "KubeContainerOOMKilledCount", "expression": "sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", "for": "PT5M", "annotations": { - "description": "Number of OOM killed containers is greater than 0" + "description": "Number of OOM killed containers is greater than 0. 
For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)." }, "enabled": true, "severity": 4, @@ -145,7 +123,7 @@ "expression": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01", "for": "PT15M", "annotations": { - "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. Please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeClientErrors.md)" + "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)." }, "enabled": true, "severity": 3, @@ -167,7 +145,7 @@ "expression": "kubelet_volume_stats_available_bytes{job=\"kubelet\"}/kubelet_volume_stats_capacity_bytes{job=\"kubelet\"} < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1", "for": "PT60M", "annotations": { - "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeFillingUp.md)" + "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)." }, "enabled": true, "severity": 4, @@ -189,7 +167,7 @@ "expression": "kubelet_volume_stats_inodes_free{job=\"kubelet\"} / kubelet_volume_stats_inodes{job=\"kubelet\"} < 0.03", "for": "PT15M", "annotations": { - "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes." + "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)." }, "enabled": true, "severity": 4, @@ -211,7 +189,7 @@ "expression": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} > 0", "for": "PT05M", "annotations": { - "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeErrors.md)"
+ "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -233,7 +211,7 @@
 "expression": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0",
 "for": "PT60M",
 "annotations": {
- "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour."
+ "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -255,7 +233,7 @@
 "expression": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0",
 "for": "PT15M",
 "annotations": {
- "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled."
+ "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -277,7 +255,7 @@
 "expression": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0",
 "for": "PT15M",
 "annotations": {
- "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run."
+ "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -294,100 +272,12 @@
 }
 ]
 },
- {
- "alert": "KubeletClientCertificateExpiration",
- "expression": "kubelet_certificate_manager_client_ttl_seconds < 7 * 24 * 3600",
- "for": "PT5M",
- "annotations": {
- "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}."
- },
- "enabled": true,
- "severity": 4,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT15M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- },
- {
- "alert": "KubeletServerCertificateExpiration",
- "expression": "kubelet_certificate_manager_server_ttl_seconds < 7 * 24 * 3600",
- "for": "PT10M",
- "annotations": {
- "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}."
- },
- "enabled": true,
- "severity": 4,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT10M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- },
- {
- "alert": "KubeletClientCertificateRenewalErrors",
- "expression": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0",
- "for": "PT15M",
- "annotations": {
- "description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)."
- },
- "enabled": true,
- "severity": 4,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT10M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- },
- {
- "alert": "KubeletServerCertificateRenewalErrors",
- "expression": "increase(kubelet_server_expiration_renew_errors[5m]) > 0",
- "for": "PT15M",
- "annotations": {
- "description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)."
- },
- "enabled": true,
- "severity": 4,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT10M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- },
 {
 "alert": "KubeQuotaAlmostFull",
 "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type)(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 0.9 < 1",
 "for": "PT15M",
 "annotations": {
- "description": "{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeQuotaAlmostFull.md)."
+ "description": "{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/cluster-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -403,50 +293,6 @@
 "actionGroupId": "[parameters('actionGroupResourceId')]"
 }
 ]
- },
- {
- "alert": "KubeQuotaFullyUsed",
- "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) == 1",
- "for": "PT15M",
- "annotations": {
- "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota."
- },
- "enabled": true,
- "severity": 4,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT10M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- },
- {
- "alert": "KubeQuotaExceeded",
- "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 1",
- "for": "PT15M",
- "annotations": {
- "description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)."
- },
- "enabled": true,
- "severity": 4,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT10M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- }
+ }
 ]
 }
@@ -470,7 +316,7 @@
 "expression": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1",
 "for": "PT15M",
 "annotations": {
- "description": "{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeUnreachable.md)."
+ "description": "{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/node-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -487,34 +333,12 @@
 }
 ]
 },
- {
- "alert": "KubeNodeNotReady",
- "expression": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0",
- "for": "PT15M",
- "annotations": {
- "description": "{{ $labels.node }} in {{ $labels.cluster}} has been unready for more than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeNotReady.md)."
- },
- "enabled": true,
- "severity": 3,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT10M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- },
 {
 "alert": "KubeNodeReadinessFlapping",
 "expression": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2",
 "for": "PT15M",
 "annotations": {
- "description": "The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeReadinessFlapping.md)."
+ "description": "The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/node-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -553,7 +377,7 @@
 "expression": "max_over_time(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", job=\"kube-state-metrics\"}[5m]) >= 1",
 "for": "PT15M",
 "annotations": {
- "description": "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf \"%.2f\" $value }} / second. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodCrashLooping.md)."
+ "description": "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf \"%.2f\" $value }} / second. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -571,11 +395,11 @@
 ]
 },
 {
- "alert": "Job did not complete in time",
+ "alert": "KubeJobStale",
 "expression": "sum by(namespace,cluster)(kube_job_spec_completions{job=\"kube-state-metrics\"}) - sum by(namespace,cluster)(kube_job_status_succeeded{job=\"kube-state-metrics\"}) > 0 ",
 "for": "PT360M",
 "annotations": {
- "description": "Number of stale jobs older than six hours is greater than 0"
+ "description": "Number of stale jobs older than six hours is greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -593,11 +417,11 @@
 ]
 },
 {
- "alert": "Pod container restarted in the last 1 hour",
+ "alert": "KubePodContainerRestart",
 "expression": "sum by (namespace, controller, container, cluster)(increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[1h])* on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0",
 "for": "PT15M",
 "annotations": {
- "description": "Pod container restarted in the last 1 hour"
+ "description": "Pod container restarted in the last 1 hour. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -615,11 +439,11 @@
 ]
 },
 {
- "alert": "Ready state of pods is less than 80%. ",
+ "alert": "KubePodReadyStateLow",
 "expression": "sum by (cluster,namespace,deployment)(kube_deployment_status_replicas_ready) / sum by (cluster,namespace,deployment)(kube_deployment_spec_replicas) <.8 or sum by (cluster,namespace,deployment)(kube_daemonset_status_number_ready) / sum by (cluster,namespace,deployment)(kube_daemonset_status_desired_number_scheduled) <.8 ",
 "for": "PT5M",
 "annotations": {
- "description": "Ready state of pods is less than 80%."
+ "description": "Ready state of pods is less than 80%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -637,11 +461,11 @@
 ]
 },
 {
- "alert": "Number of pods in failed state are greater than 0.",
+ "alert": "KubePodFailedState",
 "expression": "sum by (cluster, namespace, controller) (kube_pod_status_phase{phase=\"failed\"} * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0",
 "for": "PT5M",
 "annotations": {
- "description": "Number of pods in failed state are greater than 0"
+ "description": "Number of pods in failed state are greater than 0. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -663,7 +487,7 @@
 "expression": "sum by (namespace, controller, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"} ) * on(namespace, pod, cluster) group_left(controller)label_replace(kube_pod_owner,\"controller\",\"$1\",\"owner_name\",\"(.*)\")) > 0",
 "for": "PT15M",
 "annotations": {
- "description": "{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} by controller is not ready. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodNotReady.md)."
+ "description": "{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} by controller is not ready. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -685,29 +509,7 @@
 "expression": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}",
 "for": "PT15M",
 "annotations": {
- "description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetGenerationMismatch.md)."
- },
- "enabled": true,
- "severity": 3,
- "resolveConfiguration": {
- "autoResolved": true,
- "timeToResolve": "PT10M"
- },
- "labels": {
- "severity": "warning"
- },
- "actions": [
- {
- "actionGroupId": "[parameters('actionGroupResourceId')]"
- }
- ]
- },
- {
- "alert": "KubeJobNotCompleted",
- "expression": "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"} and kube_job_status_active{job=\"kube-state-metrics\"} > 0) > 43200",
- "for": "PT15M",
- "annotations": {
- "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} is taking more than 12 hours to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobCompletion.md)."
+ "description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -729,7 +531,7 @@
 "expression": "kube_job_failed{job=\"kube-state-metrics\"} > 0",
 "for": "PT15M",
 "annotations": {
- "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobFailed.md)."
+ "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -747,11 +549,11 @@
 ]
 },
 {
- "alert": "Average CPU usage per container is greater than 95%",
+ "alert": "KubeContainerAverageCPUHigh",
 "expression": "sum (rate(container_cpu_usage_seconds_total{image!=\"\", container!=\"POD\"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!=\"\", container!=\"POD\"}/container_spec_cpu_period{image!=\"\", container!=\"POD\"}) by (pod,cluster,container,namespace) > .95",
 "for": "PT5M",
 "annotations": {
- "description": "Average CPU usage per container is greater than 95%"
+ "description": "Average CPU usage per container is greater than 95%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -769,11 +571,11 @@
 ]
 },
 {
- "alert": "Average Memory usage per container is greater than 95%.",
+ "alert": "KubeContainerAverageMemoryHigh",
 "expression": "avg by (namespace, controller, container, cluster)(((container_memory_working_set_bytes{container!=\"\", image!=\"\", container!=\"POD\"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource=\"memory\", node!=\"\"})*on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > .95)",
 "for": "PT10M",
 "annotations": {
- "description": "Average Memory usage per container is greater than 95%"
+ "description": "Average Memory usage per container is greater than 95%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -795,7 +597,7 @@
 "expression": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\"} > 60",
 "for": "PT10M",
 "annotations": {
- "description": "Kubelet Pod startup latency is too high. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeletPodStartUpLatencyHigh.md)"
+ "description": "Kubelet Pod startup latency is too high. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -813,11 +615,11 @@
 ]
 },
 {
- "alert": "Average PV usage is greater than 80%",
+ "alert": "KubePVUsageHigh",
 "expression": "avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job=\"kubelet\"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\"))) > .8",
 "for": "PT15M",
 "annotations": {
- "description": "Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%"
+ "description": "Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 3,
@@ -839,7 +641,7 @@
 "expression": "( kube_deployment_spec_replicas{job=\"kube-state-metrics\"} > kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}) and ( changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)",
 "for": "PT15M",
 "annotations": {
- "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md)"
+ "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -861,7 +663,7 @@
 "expression": "( kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\"}) and ( changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)",
 "for": "PT15M",
 "annotations": {
- "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md)"
+ "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://aka.ms/aks-alerts/pod-level-recommended-alerts)."
 },
 "enabled": true,
 "severity": 4,
@@ -883,7 +685,7 @@
 "expression": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"} !=kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} >kube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}