-
Notifications
You must be signed in to change notification settings - Fork 194
/
mellanox_hca_temp
executable file
·60 lines (53 loc) · 1.74 KB
/
mellanox_hca_temp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env bash
#
# Script to read Mellanox HCA temperature using the Mellanox mget_temp_ext tool
#
# Copyright 2018 The Prometheus Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author: Jan Phillip Greimann <[email protected]>
set -eu
# check if root
if [ "$EUID" -ne 0 ]; then
echo "${0##*/}: Please run as root!" >&2
exit 1
fi
# check if programs are installed
if ! command -v mget_temp_ext >/dev/null 2>&1; then
echo "${0##*/}: mget_temp_ext is not installed. Aborting." >&2
exit 1
fi
cat <<EOF
# HELP node_infiniband_hca_temp_celsius Celsius temperature of Mellanox InfiniBand HCA.
# TYPE node_infiniband_hca_temp_celsius gauge
EOF
# run for each found Mellanox device
for dev in /sys/class/infiniband/*; do
if test ! -d "$dev"; then
continue
fi
device="${dev##*/}"
# get temperature
if temperature="$(mget_temp_ext -d "${device}")"; then
# output
echo "node_infiniband_hca_temp_celsius{device=\"${device}\"} ${temperature//[[:space:]]/}"
else
echo "${0##*/}: Failed to get temperature from InfiniBand HCA '${device}'!" >&2
fi
done
# if device is empty, no device was found
if [ -z "${device-}" ]; then
echo "${0##*/}: No InfiniBand HCA device found!" >&2
exit 1
fi