From 02a377232a371bbb6c211fa98ff7ddf46ada49fc Mon Sep 17 00:00:00 2001 From: Soham Dasgupta Date: Thu, 25 Jan 2024 11:56:14 -0800 Subject: [PATCH] Fix: Update Arm and Bicep templates for recommended metric alerts --- AddonBicepTemplate/README.md | 4 +- .../recommendedMetricAlerts.bicep | 906 +++++++++++++++++ ...mmendedMetricAlertsProfileParameters.json} | 2 +- .../Default/recommendedMetricAlerts.json | 939 ++++++++++++++++++ 4 files changed, 1848 insertions(+), 3 deletions(-) create mode 100644 AddonBicepTemplate/recommendedMetricAlerts.bicep rename AddonBicepTemplate/{AzureMonitorAlertsProfileParameters.json => recommendedMetricAlertsProfileParameters.json} (92%) create mode 100644 GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json diff --git a/AddonBicepTemplate/README.md b/AddonBicepTemplate/README.md index 85af60539..f45a42530 100644 --- a/AddonBicepTemplate/README.md +++ b/AddonBicepTemplate/README.md @@ -3,9 +3,9 @@ You can deploy the templates using a command like : ```az deployment group create -g -n --template-file ./FullAzureMonitorMetricsProfile.bicep --parameters ./FullAzureMonitorMetricsProfileParameters.json``` -In order to deploy community alerts and ci recommended alerts through template, deploy using command like: +In order to deploy recommended metric alerts through template, deploy using command like: -```az deployment group create -g -n --template-file .\AzureMonitorAlertsProfile.bicep --parameters .\AzureMonitorAlertsProfileParameters.json``` +```az deployment group create -g -n --template-file .\recommendedMetricAlerts.bicep --parameters .\recommendedMetricAlertsProfileParameters.json``` **NOTE** diff --git a/AddonBicepTemplate/recommendedMetricAlerts.bicep b/AddonBicepTemplate/recommendedMetricAlerts.bicep new file mode 100644 index 000000000..8111ca0a3 --- /dev/null +++ b/AddonBicepTemplate/recommendedMetricAlerts.bicep @@ -0,0 +1,906 @@ +param monitorWorkspaceName string +param location string +param aksResourceId string +param actionGroupResourceId string + +resource monitorWorkspace 'Microsoft.Monitor/accounts@2023-04-03' = { + name: monitorWorkspaceName + location: location + tags: {} + properties: {} +} + + +resource recommendedMetricAlerts 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: 'RecommendedMetricAlerts-${split(aksResourceId, '/')[8]}' + location: location + properties: { + description: 'Kubernetes Alert RuleGroup-RecommendedMetricAlerts - 0.1' + scopes: [monitorWorkspace.id,aksResourceId] + clusterName: split(aksResourceId, '/')[8] + enabled: true + interval: 'PT5M' + rules: [ + { + alert: 'KubePodCrashLooping' + expression: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1' + for: 'PT15M' + annotations: { + description: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf "%.2f" $value }} / second. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodCrashLooping.md).' 
+ } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Job did not complete in time' + expression: 'sum by(namespace,cluster)(kube_job_spec_completions{job="kube-state-metrics"}) - sum by(namespace,cluster)(kube_job_status_succeeded{job="kube-state-metrics"}) > 0 ' + for: 'PT360M' + annotations: { + description: 'Number of stale jobs older than six hours is greater than 0' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Pod container restarted in the last 1 hour' + expression: 'sum by (namespace, controller, container, cluster)(increase(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[1h])* on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + for: 'PT15M' + annotations: { + description: 'Pod container restarted in the last 1 hour' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Ready state of pods is less than 80%. ' + expression: 'sum by (cluster,namespace,deployment)(kube_deployment_status_replicas_ready) / sum by (cluster,namespace,deployment)(kube_deployment_spec_replicas) <.8 or sum by (cluster,namespace,deployment)(kube_daemonset_status_number_ready) / sum by (cluster,namespace,deployment)(kube_daemonset_status_desired_number_scheduled) <.8 ' + for: 'PT5M' + annotations: { + description: 'Ready state of pods is less than 80%.' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Number of pods in failed state are greater than 0.' + expression: 'sum by (cluster, namespace, controller) (kube_pod_status_phase{phase="failed"} * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + for: 'PT5M' + annotations: { + description: 'Number of pods in failed state are greater than 0' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubePodNotReadyByController' + expression: 'sum by (namespace, controller, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} ) * on(namespace, pod, cluster) group_left(controller)label_replace(kube_pod_owner,"controller","$1","owner_name","(.*)")) > 0' + for: 'PT15M' + annotations: { + description: '{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} by controller is not ready. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodNotReady.md).' 
+ } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeStatefulSetGenerationMismatch' + expression: 'kube_statefulset_status_observed_generation{job="kube-state-metrics"} != kube_statefulset_metadata_generation{job="kube-state-metrics"}' + for: 'PT15M' + annotations: { + description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetGenerationMismatch.md).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeJobNotCompleted' + expression: 'time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"} and kube_job_status_active{job="kube-state-metrics"} > 0) > 43200' + for: 'PT15M' + annotations: { + description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} is taking more than 12 hours to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobCompletion.md).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeJobFailed' + expression: 'kube_job_failed{job="kube-state-metrics"} > 0' + for: 'PT15M' + annotations: { + description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobFailed.md).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Average CPU usage per container is greater than 95%' + expression: 'sum (rate(container_cpu_usage_seconds_total{image!="", container!="POD"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!="", container!="POD"}/container_spec_cpu_period{image!="", container!="POD"}) by (pod,cluster,container,namespace) > .95' + for: 'PT5M' + annotations: { + description: 'Average CPU usage per container is greater than 95%' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Average Memory usage per container is greater than 95%.' 
+ expression: 'avg by (namespace, controller, container, cluster)(((container_memory_working_set_bytes{container!="", image!="", container!="POD"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource="memory", node!=""})*on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > .95)' + for: 'PT10M' + annotations: { + description: 'Average Memory usage per container is greater than 95%' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeletPodStartUpLatencyHigh' + expression: 'histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet"} > 60' + for: 'PT10M' + annotations: { + description: 'Kubelet Pod startup latency is too high. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeletPodStartUpLatencyHigh.md)' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Average PV usage is greater than 80%' + expression: 'avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job="kubelet"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job="kubelet"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > .8' + for: 'PT15M' + annotations: { + description: 'Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeDeploymentReplicasMismatch' + expression: '( kube_deployment_spec_replicas{job="kube-state-metrics"} > kube_deployment_status_replicas_available{job="kube-state-metrics"}) and ( changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' + for: 'PT15M' + annotations: { + description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md)' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeStatefulSetReplicasMismatch' + expression: '( kube_statefulset_status_replicas_ready{job="kube-state-metrics"} != kube_statefulset_status_replicas{job="kube-state-metrics"}) and ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) == 0)' + for: 'PT15M' + annotations: { + description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md)'
+ }
+ enabled: true
+ severity: 4
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeHpaReplicasMismatch'
+ expression: '(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} !=kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} >kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) and(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} <kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0'
+ for: 'PT15M'
+ annotations: {
+ description: 'Horizontal Pod Autoscaler {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} in {{ $labels.cluster}} has not matched the desired number of replicas for longer than 15 minutes.'
+ }
+ enabled: true
+ severity: 4
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT15M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeCPUQuotaOvercommit'
+ expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) /sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 1.5'
+ for: 'PT5M'
+ annotations: {
+ description: 'Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md)'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ enabled: true
+ severity: 3
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeMemoryQuotaOvercommit'
+ expression: 'sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) /sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 1.5'
+ for: 'PT5M'
+ annotations: {
+ description: 'Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md)'
+ }
+ enabled: true
+ severity: 3
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeVersionMismatch'
+ expression: 'count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1'
+ for: 'PT15M'
+ annotations: {
+ description: 'There are {{ $value }} different versions of Kubernetes components running in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeVersionMismatch.md)'
+ }
+ enabled: true
+ severity: 3
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeClientErrors'
+ expression: '(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01'
+ for: 'PT15M'
+ annotations: {
+ description: 'Kubernetes API server client \'{{ $labels.job }}/{{ $labels.instance }}\' is experiencing {{ $value | humanizePercentage }} errors.
Please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeClientErrors.md)' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubePersistentVolumeFillingUp' + expression: 'kubelet_volume_stats_available_bytes{job="kubelet"}/kubelet_volume_stats_capacity_bytes{job="kubelet"} < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1' + for: 'PT60M' + annotations: { + description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeFillingUp.md)' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubePersistentVolumeInodesFillingUp' + expression: 'kubelet_volume_stats_inodes_free{job="kubelet"} / kubelet_volume_stats_inodes{job="kubelet"} < 0.03' + for: 'PT15M' + annotations: { + description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes.' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubePersistentVolumeErrors' + expression: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0' + for: 'PT05M' + annotations: { + description: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeErrors.md)' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Average node CPU utilization is greater than 80%' + expression: '( (1 - rate(node_cpu_seconds_total{job="node", mode="idle"}[5m]) ) / ignoring(cpu) group_left count without (cpu)( node_cpu_seconds_total{job="node", mode="idle"}) ) > .8 ' + for: 'PT5M' + annotations: { + description: 'Average node CPU utilization is greater than 80%' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Working set memory for a node is greater than 80%.' 
+ expression: '1 - avg by (namespace, cluster, job, node)(label_replace(node_memory_MemAvailable_bytes{job="node"}, "node", "$1", "instance", "(.*)")) / avg by (namespace, cluster, job, node)(label_replace(node_memory_MemTotal_bytes{job="node"}, "node", "$1", "instance", "(.*)")) > .8' + for: 'PT05M' + annotations: { + description: 'Working set memory for a node is greater than 80%.' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'Number of OOM killed containers is greater than 0' + expression: 'sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, "controller", "$1", "owner_name", "(.*)")) > 0' + for: 'PT5M' + annotations: { + description: 'Number of OOM killed containers is greater than 0' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeNodeUnreachable' + expression: '(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1' + for: 'PT15M' + annotations: { + description: '{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeUnreachable.md).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeNodeNotReady' + expression: 'kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0' + for: 'PT15M' + annotations: { + description: '{{ $labels.node }} in {{ $labels.cluster}} has been unready for more than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeNotReady.md).' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeNodeReadinessFlapping' + expression: 'sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2' + for: 'PT15M' + annotations: { + description: 'The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeReadinessFlapping.md).' 
+ } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeContainerWaiting' + expression: 'sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0' + for: 'PT60M' + annotations: { + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeDaemonSetNotScheduled' + expression: 'kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0' + for: 'PT15M' + annotations: { + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeDaemonSetMisScheduled' + expression: 'kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0' + for: 'PT15M' + annotations: { + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' + } + enabled: true + severity: 3 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeletClientCertificateExpiration' + expression: 'kubelet_certificate_manager_client_ttl_seconds < 7 * 24 * 3600' + for: 'PT5M' + annotations: { + description: 'Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT15M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeletServerCertificateExpiration' + expression: 'kubelet_certificate_manager_server_ttl_seconds < 7 * 24 * 3600' + for: 'PT10M' + annotations: { + description: 'Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.' + } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + { + alert: 'KubeletClientCertificateRenewalErrors' + expression: 'increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0' + for: 'PT15M' + annotations: { + description: 'Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).' 
+ }
+ enabled: true
+ severity: 4
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeletServerCertificateRenewalErrors'
+ expression: 'increase(kubelet_server_expiration_renew_errors[5m]) > 0'
+ for: 'PT15M'
+ annotations: {
+ description: 'Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).'
+ }
+ enabled: true
+ severity: 4
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeQuotaAlmostFull'
+ expression: 'kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type)(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 0.9 < 1'
+ for: 'PT15M'
+ annotations: {
+ description: '{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeQuotaAlmostFull.md).'
+ }
+ enabled: true
+ severity: 3
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeQuotaFullyUsed'
+ expression: 'kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) == 1'
+ for: 'PT15M'
+ annotations: {
+ description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.'
+ }
+ enabled: true
+ severity: 4
+ resolveConfiguration: {
+ autoResolved: true
+ timeToResolve: 'PT10M'
+ }
+ labels: {
+ severity: 'warning'
+ }
+ actions: [
+ {
+ actionGroupId: actionGroupResourceId
+ }
+ ]
+ }
+ {
+ alert: 'KubeQuotaExceeded'
+ expression: 'kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 1'
+ for: 'PT15M'
+ annotations: {
+ description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota, which exceeds the hard limit.'
+ } + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + labels: { + severity: 'warning' + } + actions: [ + { + actionGroupId: actionGroupResourceId + } + ] + } + ] + } +} diff --git a/AddonBicepTemplate/AzureMonitorAlertsProfileParameters.json b/AddonBicepTemplate/recommendedMetricAlertsProfileParameters.json similarity index 92% rename from AddonBicepTemplate/AzureMonitorAlertsProfileParameters.json rename to AddonBicepTemplate/recommendedMetricAlertsProfileParameters.json index 5a82fc6ae..1cd71cb31 100644 --- a/AddonBicepTemplate/AzureMonitorAlertsProfileParameters.json +++ b/AddonBicepTemplate/recommendedMetricAlertsProfileParameters.json @@ -3,7 +3,7 @@ "contentVersion": "1.0.0.0", "parameters": { "monitorWorkspaceName": { - "value": "/subscriptions/{sub_id}/resourceGroups/{rg_name}/providers/microsoft.monitor/accounts/{cluster_name}" + "value": "/subscriptions/{sub_id}/resourceGroups/{rg_name}/providers/microsoft.monitor/accounts/{amw_name}" }, "location": { "value": "{amwLocation}" diff --git a/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json b/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json new file mode 100644 index 000000000..98261eee6 --- /dev/null +++ b/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json @@ -0,0 +1,939 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterName": { + "type": "string", + "metadata": { + "description": "Cluster name" + } + }, + "clusterResourceId": { + "type": "string", + "metadata": { + "description": "Cluster Resource Id" + } + }, + "actionGroupResourceId": { + "type": "string", + "metadata": { + "description": "Action Group ResourceId" + } + }, + "azureMonitorWorkspaceResourceId": { + "type": "string", + "metadata": { + "description": "ResourceId of Azure Monitor Workspace (AMW) to associate to" + } + }, + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]" + } + }, + "variables": { + "kubernetesAlertRuleGroup": "KubernetesAlert-RecommendedMetricAlerts", + "kubernetesAlertRuleGroupName": "[concat(variables('kubernetesAlertRuleGroup'), parameters('clusterName'))]", + "kubernetesAlertRuleGroupDescription": "Kubernetes Alert RuleGroup-RecommendedMetricAlerts", + "version": " - 0.1" + }, + "resources": [ + { + "name": "[variables('kubernetesAlertRuleGroupName')]", + "type": "Microsoft.AlertsManagement/prometheusRuleGroups", + "apiVersion": "2023-03-01", + "location": "[parameters('location')]", + "properties": { + "description": "[concat(variables('kubernetesAlertRuleGroupDescription'), variables('version'))]", + "scopes": [ + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" + ], + "clusterName": "[parameters('clusterName')]", + "interval": "PT1M", + "rules": [ + { + "alert": "KubePodCrashLooping", + "expression": "max_over_time(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", job=\"kube-state-metrics\"}[5m]) >= 1", + "for": "PT15M", + "annotations": { + "description": "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) in {{ $labels.cluster}} is restarting {{ printf \"%.2f\" $value }} / second. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodCrashLooping.md)." 
+ }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Job did not complete in time", + "expression": "sum by(namespace,cluster)(kube_job_spec_completions{job=\"kube-state-metrics\"}) - sum by(namespace,cluster)(kube_job_status_succeeded{job=\"kube-state-metrics\"}) > 0 ", + "for": "PT360M", + "annotations": { + "description": "Number of stale jobs older than six hours is greater than 0" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Pod container restarted in the last 1 hour", + "expression": "sum by (namespace, controller, container, cluster)(increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[1h])* on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", + "for": "PT15M", + "annotations": { + "description": "Pod container restarted in the last 1 hour" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Ready state of pods is less than 80%. ", + "expression": "sum by (cluster,namespace,deployment)(kube_deployment_status_replicas_ready) / sum by (cluster,namespace,deployment)(kube_deployment_spec_replicas) <.8 or sum by (cluster,namespace,deployment)(kube_daemonset_status_number_ready) / sum by (cluster,namespace,deployment)(kube_daemonset_status_desired_number_scheduled) <.8 ", + "for": "PT5M", + "annotations": { + "description": "Ready state of pods is less than 80%." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Number of pods in failed state are greater than 0.", + "expression": "sum by (cluster, namespace, controller) (kube_pod_status_phase{phase=\"failed\"} * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", + "for": "PT5M", + "annotations": { + "description": "Number of pods in failed state are greater than 0" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubePodNotReadyByController", + "expression": "sum by (namespace, controller, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"} ) * on(namespace, pod, cluster) group_left(controller)label_replace(kube_pod_owner,\"controller\",\"$1\",\"owner_name\",\"(.*)\")) > 0", + "for": "PT15M", + "annotations": { + "description": "{{ $labels.namespace }}/{{ $labels.pod }} in {{ $labels.cluster}} by controller is not ready. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePodNotReady.md)." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeStatefulSetGenerationMismatch", + "expression": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}", + "for": "PT15M", + "annotations": { + "description": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetGenerationMismatch.md)." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeJobNotCompleted", + "expression": "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"} and kube_job_status_active{job=\"kube-state-metrics\"} > 0) > 43200", + "for": "PT15M", + "annotations": { + "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} is taking more than 12 hours to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobCompletion.md)." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeJobFailed", + "expression": "kube_job_failed{job=\"kube-state-metrics\"} > 0", + "for": "PT15M", + "annotations": { + "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} in {{ $labels.cluster}} failed to complete. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeJobFailed.md)." 
+ }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Average CPU usage per container is greater than 95%", + "expression": "sum (rate(container_cpu_usage_seconds_total{image!=\"\", container!=\"POD\"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!=\"\", container!=\"POD\"}/container_spec_cpu_period{image!=\"\", container!=\"POD\"}) by (pod,cluster,container,namespace) > .95", + "for": "PT5M", + "annotations": { + "description": "Average CPU usage per container is greater than 95%" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Average Memory usage per container is greater than 95%.", + "expression": "avg by (namespace, controller, container, cluster)(((container_memory_working_set_bytes{container!=\"\", image!=\"\", container!=\"POD\"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource=\"memory\", node!=\"\"})*on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > .95)", + "for": "PT10M", + "annotations": { + "description": "Average Memory usage per container is greater than 95%" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletPodStartUpLatencyHigh", + "expression": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\"} > 60", + "for": "PT10M", + "annotations": { + "description": "Kubelet Pod startup latency is too high. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeletPodStartUpLatencyHigh.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Average PV usage is greater than 80%", + "expression": "avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job=\"kubelet\"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > .8", + "for": "PT15M", + "annotations": { + "description": "Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeDeploymentReplicasMismatch", + "expression": "( kube_deployment_spec_replicas{job=\"kube-state-metrics\"} > kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}) and ( changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)", + "for": "PT15M", + "annotations": { + "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeStatefulSetReplicasMismatch", + "expression": "( kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\"}) and ( changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)", + "for": "PT15M", + "annotations": { + "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md)"
+ },
+ "enabled": true,
+ "severity": 4,
+ "resolveConfiguration": {
+ "autoResolved": true,
+ "timeToResolve": "PT10M"
+ },
+ "labels": {
+ "severity": "warning"
+ },
+ "actions": [
+ {
+ "actionGroupId": "[parameters('actionGroupResourceId')]"
+ }
+ ]
+ },
+ {
+ "alert": "KubeHpaReplicasMismatch",
+ "expression": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"} !=kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} >kube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} <kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}[15m]) == 0",
+ "for": "PT15M",
+ "annotations": {
+ "description": "Horizontal Pod Autoscaler {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} in {{ $labels.cluster}} has not matched the desired number of replicas for longer than 15 minutes."
+ },
+ "enabled": true,
+ "severity": 4,
+ "resolveConfiguration": {
+ "autoResolved": true,
+ "timeToResolve": "PT15M"
+ },
+ "labels": {
+ "severity": "warning"
+ },
+ "actions": [
+ {
+ "actionGroupId": "[parameters('actionGroupResourceId')]"
+ }
+ ]
+ },
+ {
+ "alert": "KubeCPUQuotaOvercommit",
+ "expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"})) /sum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"}) > 1.5",
+ "for": "PT5M",
+ "annotations": {
+ "description": "Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md)"
+ },
+ "labels": {
+ "severity": "warning"
+ },
+ "enabled": true,
+ "severity": 3,
+ "resolveConfiguration": {
+ "autoResolved": true,
+ "timeToResolve": "PT10M"
+ },
+ "actions": [
+ {
+ "actionGroupId": "[parameters('actionGroupResourceId')]"
+ }
+ ]
+ },
+ {
+ "alert": "KubeMemoryQuotaOvercommit",
+ "expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})) /sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) > 1.5",
+ "for": "PT5M",
+ "annotations": {
+ "description": "Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md)"
+ },
+ "enabled": true,
+ "severity": 3,
+ "resolveConfiguration": {
+ "autoResolved": true,
+ "timeToResolve": "PT10M"
+ },
+ "labels": {
+ "severity": "warning"
+ },
+ "actions": [
+ {
+ "actionGroupId": "[parameters('actionGroupResourceId')]"
+ }
+ ]
+ },
+ {
+ "alert": "KubeVersionMismatch",
+ "expression": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1",
+ "for": "PT15M",
+ "annotations": {
+ "description": "There are {{ $value }} different versions of Kubernetes components running in {{ $labels.cluster}}.
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeVersionMismatch.md)" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeClientErrors", + "expression": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01", + "for": "PT15M", + "annotations": { + "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. Please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeClientErrors.md)" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubePersistentVolumeFillingUp", + "expression": "kubelet_volume_stats_available_bytes{job=\"kubelet\"}/kubelet_volume_stats_capacity_bytes{job=\"kubelet\"} < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1", + "for": "PT60M", + "annotations": { + "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeFillingUp.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubePersistentVolumeInodesFillingUp", + "expression": "kubelet_volume_stats_inodes_free{job=\"kubelet\"} / kubelet_volume_stats_inodes{job=\"kubelet\"} < 0.03", + "for": "PT15M", + "annotations": { + "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubePersistentVolumeErrors", + "expression": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} > 0", + "for": "PT05M", + "annotations": { + "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeErrors.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Average node CPU utilization is greater than 80%", + "expression": "( (1 - rate(node_cpu_seconds_total{job=\"node\", mode=\"idle\"}[5m]) ) / ignoring(cpu) group_left count without (cpu)( node_cpu_seconds_total{job=\"node\", mode=\"idle\"}) ) > .8 ", + "for": "PT5M", + "annotations": { + "description": "Average node CPU utilization is greater than 80%" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Working set memory for a node is greater than 80%.", + "expression": "1 - avg by (namespace, cluster, job, node)(label_replace(node_memory_MemAvailable_bytes{job=\"node\"}, \"node\", \"$1\", \"instance\", \"(.*)\")) / avg by (namespace, cluster, job, node)(label_replace(node_memory_MemTotal_bytes{job=\"node\"}, \"node\", \"$1\", \"instance\", \"(.*)\")) > .8", + "for": "PT05M", + "annotations": { + "description": "Working set memory for a node is greater than 80%." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Number of OOM killed containers is greater than 0", + "expression": "sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", + "for": "PT5M", + "annotations": { + "description": "Number of OOM killed containers is greater than 0" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeNodeUnreachable", + "expression": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1", + "for": "PT15M", + "annotations": { + "description": "{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeUnreachable.md)." 
+ }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeNodeNotReady", + "expression": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0", + "for": "PT15M", + "annotations": { + "description": "{{ $labels.node }} in {{ $labels.cluster}} has been unready for more than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeNotReady.md)." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeNodeReadinessFlapping", + "expression": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2", + "for": "PT15M", + "annotations": { + "description": "The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeReadinessFlapping.md)." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeContainerWaiting", + "expression": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0", + "for": "PT60M", + "annotations": { + "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeDaemonSetNotScheduled", + "expression": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0", + "for": "PT15M", + "annotations": { + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeDaemonSetMisScheduled", + "expression": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0", + "for": "PT15M", + "annotations": { + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run." 
+ }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletClientCertificateExpiration", + "expression": "kubelet_certificate_manager_client_ttl_seconds < 7 * 24 * 3600", + "for": "PT5M", + "annotations": { + "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletServerCertificateExpiration", + "expression": "kubelet_certificate_manager_server_ttl_seconds < 7 * 24 * 3600", + "for": "PT10M", + "annotations": { + "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletClientCertificateRenewalErrors", + "expression": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0", + "for": "PT15M", + "annotations": { + "description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletServerCertificateRenewalErrors", + "expression": "increase(kubelet_server_expiration_renew_errors[5m]) > 0", + "for": "PT15M", + "annotations": { + "description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeQuotaAlmostFull", + "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type)(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 0.9 < 1", + "for": "PT15M", + "annotations": { + "description": "{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeQuotaAlmostFull.md)." 
+ },
+ "enabled": true,
+ "severity": 3,
+ "resolveConfiguration": {
+ "autoResolved": true,
+ "timeToResolve": "PT10M"
+ },
+ "labels": {
+ "severity": "warning"
+ },
+ "actions": [
+ {
+ "actionGroupId": "[parameters('actionGroupResourceId')]"
+ }
+ ]
+ },
+ {
+ "alert": "KubeQuotaFullyUsed",
+ "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) == 1",
+ "for": "PT15M",
+ "annotations": {
+ "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota."
+ },
+ "enabled": true,
+ "severity": 4,
+ "resolveConfiguration": {
+ "autoResolved": true,
+ "timeToResolve": "PT10M"
+ },
+ "labels": {
+ "severity": "warning"
+ },
+ "actions": [
+ {
+ "actionGroupId": "[parameters('actionGroupResourceId')]"
+ }
+ ]
+ },
+ {
+ "alert": "KubeQuotaExceeded",
+ "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 1",
+ "for": "PT15M",
+ "annotations": {
+ "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota, which exceeds the hard limit."
+ },
+ "enabled": true,
+ "severity": 4,
+ "resolveConfiguration": {
+ "autoResolved": true,
+ "timeToResolve": "PT10M"
+ },
+ "labels": {
+ "severity": "warning"
+ },
+ "actions": [
+ {
+ "actionGroupId": "[parameters('actionGroupResourceId')]"
+ }
+ ]
+ }
+ ]
+ }
+ }
+ ]
+}
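
Usage note: the generated ARM template at GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json can also be deployed on its own with the Azure CLI, analogous to the Bicep commands in the README. Below is a minimal sketch; the parameter names come from the template's parameters section, while the resource group, deployment name, and the angle-bracketed values are placeholders to replace with your own resource names and IDs:

```az deployment group create -g <resource-group> -n <deployment-name> --template-file ./GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json --parameters clusterName=<aks-cluster-name> clusterResourceId=<aks-cluster-resource-id> actionGroupResourceId=<action-group-resource-id> azureMonitorWorkspaceResourceId=<azure-monitor-workspace-resource-id>```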