-
Notifications
You must be signed in to change notification settings - Fork 65
/
megaraid.sh
executable file
·111 lines (104 loc) · 4.14 KB
/
megaraid.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/bin/bash
# Rackspace Cloud Monitoring Plug-In
# megaraid plugin to query SMART status of drives attached to LSI megaraid or
# DELL PERC {3,700} raid controllers.
#
# ----------------------------------------------------------------------------
# "THE BEER-WARE LICENSE" (Revision 42):
# <[email protected]> wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff. If we meet some day, and you
# think this stuff is worth it, you can buy me a beer in return
# ----------------------------------------------------------------------------
#
# Usage:
# Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
#
# This plugin returns 5 metrics:
# - failed : the number of drives in failed state,
# - prefail : the number of drives in prefail state,
# - unknown : the number of drives for which the smart state could not
# be determined,
# - ok : the number of drives in OK state,
# - report : a string reporting the drive id, vendor, serial number
# as well as the smart state for non-ok drives.
# e.g. /dev/bus/0 -d megaraid,4 SEAGATE 6SL28GNF FAILED \
# ^controller & drive ids ^vendor ^serial# ^state
# ( HARDWARE IMPENDING FAILURE GENERAL HARD DRIVE FAILURE [asc=5d, ascq=10] )
# ^SMART health status for this drive
#
# The following is an example 'criteria' for a Rackspace Monitoring Alarm:
#
# if (metric['failed'] != 0) {
# return new AlarmStatus(CRITICAL, '#{failed} failed drive(s): #{report}');
# }
#
# if (metric['prefail'] != 0) {
# return new AlarmStatus(WARNING, '#{prefail} prefail drive(s): #{report}');
# }
#
# if (metric['unknown'] != 0) {
# return new AlarmStatus(WARNING, '#{unknown} unknown drive(s): #{report}');
# }
#
# return new AlarmStatus(OK, '#{ok} drive(s) OK');
#
# Things to keep in mind:
# - this plugin needs a fairly recent version of smartmontools (tested OK with 6.2)
# (apt-get install smartmontools) but does NOT need megacli.
# - on big and loaded arrays, the plugin can take more than 10s (default agent plugin
# timeout) to complete. Some disks are slower than others, not surprisingly.
# - as of now, this plugin only checks individual drives and not the status of the
# array as seen by the controller. I'd add it, but it seems hard to extract without
# megacli which I'm trying to stay away from. If you know of a way, please let me
# know.
#
#
# Locate smartctl with the `command -v` builtin so the lookup does not depend
# on an external `which` binary being installed.
SMARTCTL=$(command -v smartctl)
if [ -z "${SMARTCTL}" ]; then
  # Report the real cause instead of letting the scan below fail with a
  # misleading "drive discovery" message.
  echo "status smartctl not found"
  exit 1
fi

# Number of drives seen in each state; emitted as metrics at the end of the
# script.
OK_CNT=0
PREFAIL_CNT=0
FAILED_CNT=0
UNKNOWN_CNT=0
# Free-form description of every drive that is not OK.
REPORT=""

# Discover all drives. The scan output is kept in DEVLIST for the probing
# loop; smartctl's own diagnostics are discarded and any non-zero exit status
# aborts the plugin.
if ! DEVLIST=$("${SMARTCTL}" --scan 2>/dev/null); then
  echo status failed to perform drive discovery
  exit 1
fi
#######################################
# Map a smartctl exit status (a bit mask) to a drive state.
# Arguments: $1 - exit status of a smartctl run
# Outputs:   FAILED, PREFAIL, UNKNOWN or OK on stdout
#######################################
smart_state_from_rc() {
  local rc=$1
  if (( rc & (1 << 3) )); then
    # Bit 3: SMART status check returned "DISK FAILING".
    echo FAILED
  elif (( rc & ((1 << 4) | (1 << 5)) )); then
    # Bit 4: We found prefail Attributes <= threshold.
    # Bit 5: SMART status check returned "DISK OK" but some (usage or
    #        prefail) attributes have been <= threshold at some time in the
    #        past.
    echo PREFAIL
  elif (( rc != 0 )); then
    # Anything else (drive open failed, smart command failed, etc.).
    echo UNKNOWN
  else
    echo OK
  fi
}

#######################################
# Probe every device listed on stdin and tally the results.
# Each input line is the argument string smartctl needs to address one drive
# (e.g. "/dev/bus/0 -d megaraid,4").
# Globals:   SMARTCTL (read)
#            OK_CNT, PREFAIL_CNT, FAILED_CNT, UNKNOWN_CNT, REPORT (updated)
#######################################
check_drives() {
  local dev stat rc state health drive_id
  # -r keeps backslashes literal; the default IFS still trims the blanks that
  # surround the device string.
  while read -r dev; do
    # ${dev} is deliberately unquoted: it holds several smartctl arguments.
    # shellcheck disable=SC2086
    stat=$("${SMARTCTL}" ${dev} --info --health 2>/dev/null)
    rc=$?
    state=$(smart_state_from_rc "${rc}")

    if [ "${state}" = "OK" ]; then
      OK_CNT=$((OK_CNT + 1))
      # Healthy drives never appear in the report, so skip the parsing below.
      continue
    fi

    case "${state}" in
      FAILED) FAILED_CNT=$((FAILED_CNT + 1)) ;;
      PREFAIL) PREFAIL_CNT=$((PREFAIL_CNT + 1)) ;;
      *) UNKNOWN_CNT=$((UNKNOWN_CNT + 1)) ;;
    esac

    # -f2- keeps the whole status text even if it contains a ':' itself.
    health=$(grep -i 'smart health status:' <<<"${stat}" | cut -d':' -f2-)
    # xargs collapses the vendor and serial number lines into one
    # blank-separated string.
    drive_id=$(grep -iE '(vendor:|serial number:)' <<<"${stat}" \
      | cut -d':' -f2 | xargs)
    REPORT="${REPORT} ${dev} ${drive_id} ${state} (${health} ) "
  done
}

# only care for /dev/bus devices. /dev/sd* are logical disks
# and do not respond to any SMART command.
# The '#' comment that ends each scan line is cut off before probing.
check_drives < <(grep /dev/bus/ <<<"${DEVLIST}" | cut -d'#' -f1)
# An empty (or unset) report means no drive was flagged; substitute the
# default text so the string metric is never blank.
: "${REPORT:=all drives OK}"

# Emit the status line followed by one line per metric, then end the plugin
# run successfully.
printf '%s\n' \
  "status smart status retrieved" \
  "metric failed uint32 ${FAILED_CNT}" \
  "metric prefail uint32 ${PREFAIL_CNT}" \
  "metric unknown uint32 ${UNKNOWN_CNT}" \
  "metric ok uint32 ${OK_CNT}" \
  "metric report string ${REPORT}"
exit 0