prometheusvalues.yml (forked from MithunTechnologiesDevOps/Kubernates-Manifests)
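This values file overrides the Prometheus Helm chart's defaults to wire Alertmanager up to Gmail and to load a starter set of alerting rules. As a rough usage sketch (the release and chart names here are assumptions, not part of this file), it would be passed to Helm with something like `helm install prometheus prometheus-community/prometheus -f prometheusvalues.yml`.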
alertmanagerFiles:
  alertmanager.yml:
    global:
      resolve_timeout: 1m
      # slack_api_url: ''
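    # Receivers define where alerts go. The email_configs entry below relays
    # through Gmail SMTP on port 587 (STARTTLS); the addresses and password
    # are placeholders you must replace.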
    receivers:
      - name: 'gmail-notifications'
        email_configs:
          - to: [email protected]
            from: [email protected]          # Update your from mail id here
            smarthost: smtp.gmail.com:587
            auth_username: [email protected] # Update your from mail id here
            auth_identity: [email protected] # Update your from mail id here
            auth_password: XXXX                   # Update your password here
            send_resolved: true
            headers:
              subject: "Prometheus - Alert"
            text: "{{ range .Alerts }}Hi,\n{{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
    # slack_configs:
    #   - channel: '@you'
    #     send_resolved: true
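    # The route block controls delivery cadence: group_wait delays the first
    # notification so related alerts batch together, group_interval throttles
    # updates to an existing group, and repeat_interval re-sends alerts that
    # are still firing. The 2m intervals suit demos; production values are
    # usually much longer.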
    route:
      group_wait: 10s
      group_interval: 2m
      receiver: 'gmail-notifications'
      repeat_interval: 2m
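# Entries under serverFiles are rendered into the Prometheus server's
# configuration; alerting_rules.yml defines the rule groups the server
# evaluates and forwards to Alertmanager when they fire.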
serverFiles:
  alerting_rules.yml:
    groups:
      - name: NodeDown
        rules:
          # Alert for any node that is unreachable for more than 2 minutes.
          - alert: InstanceDown
            expr: up{job="kubernetes-nodes"} == 0
            for: 2m
            labels:
              severity: page
            annotations:
              host: "{{ $labels.kubernetes_io_hostname }}"
              summary: "Instance down"
              description: "Node {{ $labels.kubernetes_io_hostname }} has been down for more than 2 minutes."
      - name: low_memory_alert
        rules:
          - alert: LowMemory
            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 15
            for: 2m
            labels:
              severity: warning
            annotations:
              host: "{{ $labels.kubernetes_node }}"
              summary: "{{ $labels.kubernetes_node }} host is low on memory. Only {{ $value }}% left"
              description: "{{ $labels.kubernetes_node }} node is low on memory. Only {{ $value }}% left"
          - alert: KubePersistentVolumeErrors
            expr: kube_persistentvolume_status_phase{job="kubernetes-service-endpoints",phase=~"Failed|Pending"} > 0
            for: 2m
            labels:
              severity: critical
            annotations:
              description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
              summary: PersistentVolume is having issues with provisioning.
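          # rate() yields per-second restart counts over the 5m window;
          # multiplying by 60 * 5 rescales that to restarts per 5 minutes.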
          - alert: KubePodCrashLooping
            expr: rate(kube_pod_container_status_restarts_total{job="kubernetes-service-endpoints",namespace=~".*"}[5m]) * 60 * 5 > 0
            for: 2m
            labels:
              severity: warning
            annotations:
              description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
              summary: Pod is crash looping.
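          # The join with kube_pod_owner (owner_kind != "Job") excludes
          # Job-owned pods, so completed batch pods stuck in Pending/Unknown
          # don't trigger this alert.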
          - alert: KubePodNotReady
            expr: sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kubernetes-service-endpoints",namespace=~".*",phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
            for: 2m
            labels:
              severity: warning
            annotations:
              description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 2 minutes.
              summary: Pod has been in a non-ready state for more than 2 minutes.