diff --git a/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json b/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json index 98261eee6..8d0b2b304 100644 --- a/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json +++ b/GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json @@ -39,7 +39,7 @@ }, "resources": [ { - "name": "[variables('kubernetesAlertRuleGroupName')]", + "name": "[concat(variables('kubernetesAlertRuleGroupName'), '-Cluster-level')]", "type": "Microsoft.AlertsManagement/prometheusRuleGroups", "apiVersion": "2023-03-01", "location": "[parameters('location')]", @@ -52,7 +52,503 @@ "clusterName": "[parameters('clusterName')]", "interval": "PT1M", "rules": [ - { + { + "alert": "KubeCPUQuotaOvercommit", + "expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(cpu|requests.cpu)\"})) /sum(kube_node_status_allocatable{resource=\"cpu\", job=\"kube-state-metrics\"}) > 1.5", + "for": "PT5M", + "annotations": { + "description": "Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md)" + }, + "labels": { + "severity": "warning" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeMemoryQuotaOvercommit", + "expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})) /sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) > 1.5", + "for": "PT5M", + "annotations": { + "description": "Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md)" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeVersionMismatch", + "expression": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1", + "for": "PT15M", + "annotations": { + "description": "There are {{ $value }} different versions of Kubernetes components running in {{ $labels.cluster}}. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeVersionMismatch.md)" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "Number of OOM killed containers is greater than 0", + "expression": "sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", + "for": "PT5M", + "annotations": { + "description": "Number of OOM killed containers is greater than 0" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeClientErrors", + "expression": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01", + "for": "PT15M", + "annotations": { + "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. Please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeClientErrors.md)" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubePersistentVolumeFillingUp", + "expression": "kubelet_volume_stats_available_bytes{job=\"kubelet\"}/kubelet_volume_stats_capacity_bytes{job=\"kubelet\"} < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1", + "for": "PT60M", + "annotations": { + "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeFillingUp.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubePersistentVolumeInodesFillingUp", + "expression": "kubelet_volume_stats_inodes_free{job=\"kubelet\"} / kubelet_volume_stats_inodes{job=\"kubelet\"} < 0.03", + "for": "PT15M", + "annotations": { + "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubePersistentVolumeErrors", + "expression": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} > 0", + "for": "PT05M", + "annotations": { + "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeErrors.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeContainerWaiting", + "expression": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0", + "for": "PT60M", + "annotations": { + "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeDaemonSetNotScheduled", + "expression": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0", + "for": "PT15M", + "annotations": { + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeDaemonSetMisScheduled", + "expression": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0", + "for": "PT15M", + "annotations": { + "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run." 
+ }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletClientCertificateExpiration", + "expression": "kubelet_certificate_manager_client_ttl_seconds < 7 * 24 * 3600", + "for": "PT5M", + "annotations": { + "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletServerCertificateExpiration", + "expression": "kubelet_certificate_manager_server_ttl_seconds < 7 * 24 * 3600", + "for": "PT10M", + "annotations": { + "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletClientCertificateRenewalErrors", + "expression": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0", + "for": "PT15M", + "annotations": { + "description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeletServerCertificateRenewalErrors", + "expression": "increase(kubelet_server_expiration_renew_errors[5m]) > 0", + "for": "PT15M", + "annotations": { + "description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)." + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeQuotaAlmostFull", + "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type)(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 0.9 < 1", + "for": "PT15M", + "annotations": { + "description": "{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeQuotaAlmostFull.md)." 
+            },
+            "enabled": true,
+            "severity": 3,
+            "resolveConfiguration": {
+              "autoResolved": true,
+              "timeToResolve": "PT10M"
+            },
+            "labels": {
+              "severity": "warning"
+            },
+            "actions": [
+              {
+                "actionGroupId": "[parameters('actionGroupResourceId')]"
+              }
+            ]
+          },
+          {
+            "alert": "KubeQuotaFullyUsed",
+            "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) == 1",
+            "for": "PT15M",
+            "annotations": {
+              "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota."
+            },
+            "enabled": true,
+            "severity": 4,
+            "resolveConfiguration": {
+              "autoResolved": true,
+              "timeToResolve": "PT10M"
+            },
+            "labels": {
+              "severity": "warning"
+            },
+            "actions": [
+              {
+                "actionGroupId": "[parameters('actionGroupResourceId')]"
+              }
+            ]
+          },
+          {
+            "alert": "KubeQuotaExceeded",
+            "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 1",
+            "for": "PT15M",
+            "annotations": {
+              "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota."
+            },
+            "enabled": true,
+            "severity": 4,
+            "resolveConfiguration": {
+              "autoResolved": true,
+              "timeToResolve": "PT10M"
+            },
+            "labels": {
+              "severity": "warning"
+            },
+            "actions": [
+              {
+                "actionGroupId": "[parameters('actionGroupResourceId')]"
+              }
+            ]
+          }
+        ]
+      }
+    },
+    {
+      "name": "[concat(variables('kubernetesAlertRuleGroupName'), '-Node-level')]",
+      "type": "Microsoft.AlertsManagement/prometheusRuleGroups",
+      "apiVersion": "2023-03-01",
+      "location": "[parameters('location')]",
+      "properties": {
+        "description": "[concat(variables('kubernetesAlertRuleGroupDescription'), variables('version'))]",
+        "scopes": [
+          "[parameters('azureMonitorWorkspaceResourceId')]",
+          "[parameters('clusterResourceId')]"
+        ],
+        "clusterName": "[parameters('clusterName')]",
+        "interval": "PT1M",
+        "rules": [
+          {
+            "alert": "KubeNodeUnreachable",
+            "expression": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1",
+            "for": "PT15M",
+            "annotations": {
+              "description": "{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeUnreachable.md)."
+            },
+            "enabled": true,
+            "severity": 3,
+            "resolveConfiguration": {
+              "autoResolved": true,
+              "timeToResolve": "PT10M"
+            },
+            "labels": {
+              "severity": "warning"
+            },
+            "actions": [
+              {
+                "actionGroupId": "[parameters('actionGroupResourceId')]"
+              }
+            ]
+          },
+          {
+            "alert": "KubeNodeNotReady",
+            "expression": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0",
+            "for": "PT15M",
+            "annotations": {
+              "description": "{{ $labels.node }} in {{ $labels.cluster}} has been unready for more than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeNotReady.md)."
+ }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeNodeReadinessFlapping", + "expression": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2", + "for": "PT15M", + "annotations": { + "description": "The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeReadinessFlapping.md)." + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + } + ] + } + }, + { + "name": "[concat(variables('kubernetesAlertRuleGroupName'), '-Pod-level')]", + "type": "Microsoft.AlertsManagement/prometheusRuleGroups", + "apiVersion": "2023-03-01", + "location": "[parameters('location')]", + "properties": { + "description": "[concat(variables('kubernetesAlertRuleGroupDescription'), variables('version'))]", + "scopes": [ + "[parameters('azureMonitorWorkspaceResourceId')]", + "[parameters('clusterResourceId')]" + ], + "clusterName": "[parameters('clusterName')]", + "interval": "PT1M", + "rules": [ + { "alert": "KubePodCrashLooping", "expression": "max_over_time(kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\", job=\"kube-state-metrics\"}[5m]) >= 1", "for": "PT15M", @@ -73,8 +569,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "Job did not complete in time", "expression": "sum by(namespace,cluster)(kube_job_spec_completions{job=\"kube-state-metrics\"}) - sum by(namespace,cluster)(kube_job_status_succeeded{job=\"kube-state-metrics\"}) > 0 ", "for": "PT360M", @@ -95,8 +591,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "Pod container restarted in the last 1 hour", "expression": "sum by (namespace, controller, container, cluster)(increase(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[1h])* on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", "for": "PT15M", @@ -117,8 +613,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "Ready state of pods is less than 80%. 
", "expression": "sum by (cluster,namespace,deployment)(kube_deployment_status_replicas_ready) / sum by (cluster,namespace,deployment)(kube_deployment_spec_replicas) <.8 or sum by (cluster,namespace,deployment)(kube_daemonset_status_number_ready) / sum by (cluster,namespace,deployment)(kube_daemonset_status_desired_number_scheduled) <.8 ", "for": "PT5M", @@ -139,8 +635,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "Number of pods in failed state are greater than 0.", "expression": "sum by (cluster, namespace, controller) (kube_pod_status_phase{phase=\"failed\"} * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", "for": "PT5M", @@ -161,8 +657,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "KubePodNotReadyByController", "expression": "sum by (namespace, controller, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"} ) * on(namespace, pod, cluster) group_left(controller)label_replace(kube_pod_owner,\"controller\",\"$1\",\"owner_name\",\"(.*)\")) > 0", "for": "PT15M", @@ -183,8 +679,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "KubeStatefulSetGenerationMismatch", "expression": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"} != kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}", "for": "PT15M", @@ -205,8 +701,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "KubeJobNotCompleted", "expression": "time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job=\"kube-state-metrics\"} and kube_job_status_active{job=\"kube-state-metrics\"} > 0) > 43200", "for": "PT15M", @@ -227,8 +723,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "KubeJobFailed", "expression": "kube_job_failed{job=\"kube-state-metrics\"} > 0", "for": "PT15M", @@ -249,8 +745,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "Average CPU usage per container is greater than 95%", "expression": "sum (rate(container_cpu_usage_seconds_total{image!=\"\", container!=\"POD\"}[5m])) by (pod,cluster,container,namespace) / sum(container_spec_cpu_quota{image!=\"\", container!=\"POD\"}/container_spec_cpu_period{image!=\"\", container!=\"POD\"}) by (pod,cluster,container,namespace) > .95", "for": "PT5M", @@ -271,8 +767,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "Average Memory usage per container is greater than 95%.", "expression": "avg by (namespace, controller, container, cluster)(((container_memory_working_set_bytes{container!=\"\", image!=\"\", container!=\"POD\"} / on(namespace,cluster,pod,container) group_left kube_pod_container_resource_limits{resource=\"memory\", node!=\"\"})*on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > .95)", "for": "PT10M", @@ -293,8 +789,8 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { + }, + { "alert": "KubeletPodStartUpLatencyHigh", "expression": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job=\"kubelet\"} > 60", "for": "PT10M", @@ 
-315,623 +811,117 @@ "actionGroupId": "[parameters('actionGroupResourceId')]" } ] - }, - { - "alert": "Average PV usage is greater than 80%", - "expression": "avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job=\"kubelet\"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > .8", - "for": "PT15M", - "annotations": { - "description": "Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%" - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeDeploymentReplicasMismatch", - "expression": "( kube_deployment_spec_replicas{job=\"kube-state-metrics\"} > kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}) and ( changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)", - "for": "PT15M", - "annotations": { - "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md)" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT15M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeStatefulSetReplicasMismatch", - "expression": "( kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\"}) and ( changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)", - "for": "PT15M", - "annotations": { - "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md)" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeHpaReplicasMismatch", - "expression": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"} !=kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} >kube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} 1.5", - "for": "PT5M", - "annotations": { - "description": "Cluster {{ $labels.cluster}} has overcommitted CPU resource requests for Namespaces. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeCPUQuotaOvercommit.md)" - }, - "labels": { - "severity": "warning" - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeMemoryQuotaOvercommit", - "expression": "sum(min without(resource) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=~\"(memory|requests.memory)\"})) /sum(kube_node_status_allocatable{resource=\"memory\", job=\"kube-state-metrics\"}) > 1.5", - "for": "PT5M", - "annotations": { - "description": "Cluster {{ $labels.cluster}} has overcommitted memory resource requests for Namespaces. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeMemoryQuotaOvercommit.md)" - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeVersionMismatch", - "expression": "count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"git_version\",\"$1\",\"git_version\",\"(v[0-9]*.[0-9]*).*\"))) > 1", - "for": "PT15M", - "annotations": { - "description": "There are {{ $value }} different versions of Kubernetes components running in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeVersionMismatch.md)" - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeClientErrors", - "expression": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace)) > 0.01", - "for": "PT15M", - "annotations": { - "description": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. 
Please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeClientErrors.md)" - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubePersistentVolumeFillingUp", - "expression": "kubelet_volume_stats_available_bytes{job=\"kubelet\"}/kubelet_volume_stats_capacity_bytes{job=\"kubelet\"} < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"} == 1", - "for": "PT60M", - "annotations": { - "description": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeFillingUp.md)" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT15M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubePersistentVolumeInodesFillingUp", - "expression": "kubelet_volume_stats_inodes_free{job=\"kubelet\"} / kubelet_volume_stats_inodes{job=\"kubelet\"} < 0.03", - "for": "PT15M", - "annotations": { - "description": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} only has {{ $value | humanizePercentage }} free inodes." - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubePersistentVolumeErrors", - "expression": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} > 0", - "for": "PT05M", - "annotations": { - "description": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubePersistentVolumeErrors.md)" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "Average node CPU utilization is greater than 80%", - "expression": "( (1 - rate(node_cpu_seconds_total{job=\"node\", mode=\"idle\"}[5m]) ) / ignoring(cpu) group_left count without (cpu)( node_cpu_seconds_total{job=\"node\", mode=\"idle\"}) ) > .8 ", - "for": "PT5M", - "annotations": { - "description": "Average node CPU utilization is greater than 80%" - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { + }, + { + "alert": "Average PV usage is greater than 80%", + "expression": "avg by (namespace, controller, container, cluster)(((kubelet_volume_stats_used_bytes{job=\"kubelet\"} / on(namespace,cluster,pod,container) group_left kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}) * on(namespace, pod, cluster) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > .8", + "for": "PT15M", + "annotations": { + "description": "Average PV usage on pod {{ $labels.pod }} in container {{ $labels.container }} is greater than 80%" + }, + "enabled": true, + "severity": 3, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeDeploymentReplicasMismatch", + "expression": "( kube_deployment_spec_replicas{job=\"kube-state-metrics\"} > kube_deployment_status_replicas_available{job=\"kube-state-metrics\"}) and ( changes(kube_deployment_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)", + "for": "PT15M", + "annotations": { + "description": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} in {{ $labels.cluster}} replica mismatch. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeDeploymentReplicasMismatch.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeStatefulSetReplicasMismatch", + "expression": "( kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"} != kube_statefulset_status_replicas{job=\"kube-state-metrics\"}) and ( changes(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}[10m]) == 0)", + "for": "PT15M", + "annotations": { + "description": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} in {{ $labels.cluster}} replica mismatch. 
For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeStatefulSetReplicasMismatch.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT10M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeHpaReplicasMismatch", + "expression": "(kube_horizontalpodautoscaler_status_desired_replicas{job=\"kube-state-metrics\"} !=kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} >kube_horizontalpodautoscaler_spec_min_replicas{job=\"kube-state-metrics\"}) and(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} .8", - "for": "PT05M", - "annotations": { - "description": "Working set memory for a node is greater than 80%." - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT15M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "Number of OOM killed containers is greater than 0", - "expression": "sum by (cluster,container,controller,namespace)(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} * on(cluster,namespace,pod) group_left(controller) label_replace(kube_pod_owner, \"controller\", \"$1\", \"owner_name\", \"(.*)\")) > 0", - "for": "PT5M", - "annotations": { - "description": "Number of OOM killed containers is greater than 0" - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeNodeUnreachable", - "expression": "(kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} unless ignoring(key,value) kube_node_spec_taint{job=\"kube-state-metrics\",key=~\"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn\"}) == 1", - "for": "PT15M", - "annotations": { - "description": "{{ $labels.node }} in {{ $labels.cluster}} is unreachable and some workloads may be rescheduled. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeUnreachable.md)." - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeNodeNotReady", - "expression": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0", - "for": "PT15M", - "annotations": { - "description": "{{ $labels.node }} in {{ $labels.cluster}} has been unready for more than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeNotReady.md)." 
- }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeNodeReadinessFlapping", - "expression": "sum(changes(kube_node_status_condition{status=\"true\",condition=\"Ready\"}[15m])) by (cluster, node) > 2", - "for": "PT15M", - "annotations": { - "description": "The readiness status of node {{ $labels.node }} in {{ $labels.cluster}} has changed more than 2 times in the last 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeNodeReadinessFlapping.md)." - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeContainerWaiting", - "expression": "sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0", - "for": "PT60M", - "annotations": { - "description": "pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour." - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeDaemonSetNotScheduled", - "expression": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} - kube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0", - "for": "PT15M", - "annotations": { - "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled." - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeDaemonSetMisScheduled", - "expression": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0", - "for": "PT15M", - "annotations": { - "description": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run." - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeletClientCertificateExpiration", - "expression": "kubelet_certificate_manager_client_ttl_seconds < 7 * 24 * 3600", - "for": "PT5M", - "annotations": { - "description": "Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}." 
- }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT15M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeletServerCertificateExpiration", - "expression": "kubelet_certificate_manager_server_ttl_seconds < 7 * 24 * 3600", - "for": "PT10M", - "annotations": { - "description": "Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}." - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeletClientCertificateRenewalErrors", - "expression": "increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0", - "for": "PT15M", - "annotations": { - "description": "Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes)." - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeletServerCertificateRenewalErrors", - "expression": "increase(kubelet_server_expiration_renew_errors[5m]) > 0", - "for": "PT15M", - "annotations": { - "description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)." - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeQuotaAlmostFull", - "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type)(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 0.9 < 1", - "for": "PT15M", - "annotations": { - "description": "{{ $value | humanizePercentage }} usage of {{ $labels.resource }} in namespace {{ $labels.namespace }} in {{ $labels.cluster}}. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeQuotaAlmostFull.md)." - }, - "enabled": true, - "severity": 3, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeQuotaFullyUsed", - "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) == 1", - "for": "PT15M", - "annotations": { - "description": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota." 
- }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - }, - { - "alert": "KubeQuotaExceeded", - "expression": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"} / ignoring(instance, job, type) (kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0) > 1", - "for": "PT15M", - "annotations": { - "description": "Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes)." - }, - "enabled": true, - "severity": 4, - "resolveConfiguration": { - "autoResolved": true, - "timeToResolve": "PT10M" - }, - "labels": { - "severity": "warning" - }, - "actions": [ - { - "actionGroupId": "[parameters('actionGroupResourceId')]" - } - ] - } + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + }, + { + "alert": "KubeHpaMaxedOut", + "expression": "kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\"} ==kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\"}", + "for": "PT15M", + "annotations": { + "description": "Horizontal Pod Autoscaler in {{ $labels.cluster}} has been running at max replicas for longer than 15 minutes. For more information on this alert, please refer to this [link](https://github.com/prometheus-operator/runbooks/blob/main/content/runbooks/kubernetes/KubeHpaMaxedOut.md)" + }, + "enabled": true, + "severity": 4, + "resolveConfiguration": { + "autoResolved": true, + "timeToResolve": "PT15M" + }, + "labels": { + "severity": "warning" + }, + "actions": [ + { + "actionGroupId": "[parameters('actionGroupResourceId')]" + } + ] + } ] } }
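Every rule above is plain PromQL plus ISO 8601 durations embedded in an ARM template, so the most common slips in this file (a missing field, a malformed "for" or "timeToResolve" value) can be caught offline before deployment. The sketch below is an illustrative reviewer-side check, not part of the shipped generator: the template path is the one this diff touches, and the walk over resources[].properties.rules[] assumes only the resource shape visible above.

#!/usr/bin/env python3
"""Offline sanity check for generated prometheusRuleGroups templates.

Illustrative sketch (stdlib only); nothing here is part of the generator.
"""
import json
import re
import sys

TEMPLATE = "GeneratedMonitoringArtifacts/Default/recommendedMetricAlerts.json"

# Durations in this template are ISO 8601, e.g. PT5M, PT60M, PT360M.
ISO_DURATION = re.compile(r"^PT(?:\d+H)?(?:\d+M)?(?:\d+S)?$")

def check(path: str) -> int:
    errors = 0
    with open(path) as handle:
        template = json.load(handle)
    for resource in template.get("resources", []):
        if resource.get("type") != "Microsoft.AlertsManagement/prometheusRuleGroups":
            continue
        for rule in resource.get("properties", {}).get("rules", []):
            name = rule.get("alert", "<unnamed rule>")
            # Every alert rule needs an expression, a severity and a 'for' window.
            for field in ("expression", "severity", "for"):
                if field not in rule:
                    print(f"{name}: missing required field '{field}'")
                    errors += 1
            # Both duration fields must parse as ISO 8601.
            durations = [
                rule.get("for", ""),
                rule.get("resolveConfiguration", {}).get("timeToResolve", ""),
            ]
            for value in durations:
                if value and not ISO_DURATION.match(value):
                    print(f"{name}: malformed duration {value!r}")
                    errors += 1
    return errors

if __name__ == "__main__":
    sys.exit(1 if check(TEMPLATE) else 0)

Run from the repository root after regenerating the artifacts; a non-zero exit code flags rules that need another look before the template is deployed.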