Skip to content

Commit

Permalink
Merge pull request #176 from retinadata/bmc_watchdog
Browse files Browse the repository at this point in the history
Bmc watchdog
  • Loading branch information
bitfehler authored Nov 15, 2023
2 parents d4c9372 + 46a9390 commit b302e65
Show file tree
Hide file tree
Showing 6 changed files with 321 additions and 12 deletions.
164 changes: 164 additions & 0 deletions collector_bmc_watchdog.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"

"github.com/prometheus-community/ipmi_exporter/freeipmi"
)

const (
// BMCWatchdogCollectorName is the collector name ("bmc-watchdog") that
// identifies the BMCWatchdogCollector.
BMCWatchdogCollectorName CollectorName = "bmc-watchdog"
)

// Metric descriptors for the BMC watchdog collector, plus the fixed sets of
// label values used by the one-hot "state" metrics (one series per possible
// value, set to 1 for the value currently reported and 0 for all others).
var (
	// bmcWatchdogTimerDesc describes whether the watchdog timer is running.
	bmcWatchdogTimerDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "timer_state"),
		"Watchdog timer running (1: running, 0: stopped)",
		[]string{},
		nil,
	)
	// watchdogTimerUses lists the timer uses exported as "name" label values.
	watchdogTimerUses = []string{"BIOS FRB2", "BIOS POST", "OS LOAD", "SMS/OS", "OEM"}
	// bmcWatchdogTimerUseDesc describes which timer use is currently active.
	bmcWatchdogTimerUseDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "timer_use_state"),
		"Watchdog timer use (1: active, 0: inactive)",
		[]string{"name"},
		nil,
	)
	// bmcWatchdogLoggingDesc describes whether watchdog logging is enabled.
	bmcWatchdogLoggingDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "logging_state"),
		"Watchdog log flag (1: Enabled, 0: Disabled / note: reverse of freeipmi)",
		[]string{},
		nil,
	)
	// watchdogTimeoutActions lists the timeout actions exported as "action"
	// label values.
	watchdogTimeoutActions = []string{"None", "Hard Reset", "Power Down", "Power Cycle"}
	// bmcWatchdogTimeoutActionDesc describes which timeout action is configured.
	bmcWatchdogTimeoutActionDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "timeout_action_state"),
		"Watchdog timeout action (1: active, 0: inactive)",
		[]string{"action"},
		nil,
	)
	// watchdogPretimeoutInterrupts lists the pre-timeout interrupts exported
	// as "interrupt" label values.
	watchdogPretimeoutInterrupts = []string{"None", "SMI", "NMI / Diagnostic Interrupt", "Messaging Interrupt"}
	// bmcWatchdogPretimeoutInterruptDesc describes which pre-timeout interrupt
	// is configured.
	bmcWatchdogPretimeoutInterruptDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "pretimeout_interrupt_state"),
		"Watchdog pre-timeout interrupt (1: active, 0: inactive)",
		[]string{"interrupt"},
		nil,
	)
	// bmcWatchdogPretimeoutIntervalDesc describes the pre-timeout interval.
	bmcWatchdogPretimeoutIntervalDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "pretimeout_interval_seconds"),
		"Watchdog pre-timeout interval in seconds",
		[]string{},
		nil,
	)
	// bmcWatchdogInitialCountdownDesc describes the initial countdown value.
	bmcWatchdogInitialCountdownDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "initial_countdown_seconds"),
		"Watchdog initial countdown in seconds",
		[]string{},
		nil,
	)
	// bmcWatchdogCurrentCountdownDesc describes the current countdown value.
	// The help text previously said "initial" (copy-paste from the descriptor
	// above), which mislabelled this metric in the exposition output.
	bmcWatchdogCurrentCountdownDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "bmc_watchdog", "current_countdown_seconds"),
		"Watchdog current countdown in seconds",
		[]string{},
		nil,
	)
)

// BMCWatchdogCollector exports BMC watchdog metrics parsed from the output of
// the bmc-watchdog command. It carries no state of its own.
type BMCWatchdogCollector struct{}

// Name returns this collector's name, BMCWatchdogCollectorName.
func (c BMCWatchdogCollector) Name() CollectorName {
return BMCWatchdogCollectorName
}

// Cmd returns the command name for this collector, "bmc-watchdog".
// NOTE(review): presumably the FreeIPMI binary the exporter executes to
// produce the Result passed to Collect — confirm against the caller.
func (c BMCWatchdogCollector) Cmd() string {
return "bmc-watchdog"
}

// Args returns the command-line arguments for this collector: the single
// flag "--get".
func (c BMCWatchdogCollector) Args() []string {
return []string{"--get"}
}

// Collect extracts every BMC watchdog field from result and, only if all of
// them parse, sends the corresponding metrics on ch.
//
// The first field that fails to parse is logged (with the target's name) and
// its error is returned together with 0; no metrics are sent in that case.
// On success it returns 1 and a nil error.
// NOTE(review): the int result looks like a success/"up" flag — confirm
// against the collector interface.
func (c BMCWatchdogCollector) Collect(result freeipmi.Result, ch chan<- prometheus.Metric, target ipmiTarget) (int, error) {
	// fail logs a parse failure for this target and produces the error return.
	fail := func(msg string, err error) (int, error) {
		level.Error(logger).Log("msg", msg, "target", targetName(target.host), "error", err)
		return 0, err
	}
	// gauge sends a single label-less gauge sample.
	gauge := func(desc *prometheus.Desc, value float64) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value)
	}
	// oneHot sends one sample per candidate label value: 1 for the value that
	// equals active, 0 for every other candidate.
	oneHot := func(desc *prometheus.Desc, active string, candidates []string) {
		for _, candidate := range candidates {
			value := 0.0
			if candidate == active {
				value = 1
			}
			ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, candidate)
		}
	}

	// Parse everything up front so a failure never leaves a partial metric set.
	running, err := freeipmi.GetBMCWatchdogTimerState(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog timer", err)
	}
	activeUse, err := freeipmi.GetBMCWatchdogTimerUse(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog timer use", err)
	}
	logging, err := freeipmi.GetBMCWatchdogLoggingState(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog logging", err)
	}
	activeAction, err := freeipmi.GetBMCWatchdogTimeoutAction(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog timeout action", err)
	}
	activeInterrupt, err := freeipmi.GetBMCWatchdogPretimeoutInterrupt(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog pretimeout interrupt", err)
	}
	pretimeout, err := freeipmi.GetBMCWatchdogPretimeoutInterval(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog pretimeout interval", err)
	}
	initial, err := freeipmi.GetBMCWatchdogInitialCountdown(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog initial countdown", err)
	}
	remaining, err := freeipmi.GetBMCWatchdogCurrentCountdown(result)
	if err != nil {
		return fail("Failed to collect BMC watchdog current countdown", err)
	}

	gauge(bmcWatchdogTimerDesc, running)
	oneHot(bmcWatchdogTimerUseDesc, activeUse, watchdogTimerUses)
	gauge(bmcWatchdogLoggingDesc, logging)
	oneHot(bmcWatchdogTimeoutActionDesc, activeAction, watchdogTimeoutActions)
	oneHot(bmcWatchdogPretimeoutInterruptDesc, activeInterrupt, watchdogPretimeoutInterrupts)
	gauge(bmcWatchdogPretimeoutIntervalDesc, pretimeout)
	gauge(bmcWatchdogInitialCountdownDesc, initial)
	gauge(bmcWatchdogCurrentCountdownDesc, remaining)
	return 1, nil
}
2 changes: 2 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ func (c CollectorName) GetInstance() (collector, error) {
return IPMICollector{}, nil
case BMCCollectorName:
return BMCCollector{}, nil
case BMCWatchdogCollectorName:
return BMCWatchdogCollector{}, nil
case SELCollectorName:
return SELCollector{}, nil
case DCMICollectorName:
Expand Down
52 changes: 52 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ These metrics provide data about the scrape itself:
power consumption metrics (see below) will not be available
- `bmc`: collects BMC details. If it fails, BMC info metrics (see below)
will not be available
- `bmc-watchdog`: collects status of the watchdog. If it fails, BMC watchdog
metrics (see below) will not be available
- `chassis`: collects the current chassis power state (on/off). If it fails,
the chassis power state metric (see below) will not be available
- `sel`: collects system event log (SEL) details. If it fails, SEL metrics
Expand All @@ -36,6 +38,56 @@ version). Example:
**Note:** some systems do not expose the system's firmware version, in which
case it will be exported as `"N/A"`.

## BMC Watchdog

These metrics are only provided if the `bmc-watchdog` collector is enabled.

The metric `ipmi_bmc_watchdog_timer_state` shows whether the watchdog timer is
currently running (1) or stopped (0).

The metric `ipmi_bmc_watchdog_timer_use_state` shows which timer use is
currently active. Per freeipmi bmc-watchdog manual there are 5 uses. This metric
will return 1 for only one of those and 0 for the rest.

ipmi_bmc_watchdog_timer_use_state{name="BIOS FRB2"} 1
ipmi_bmc_watchdog_timer_use_state{name="BIOS POST"} 0
ipmi_bmc_watchdog_timer_use_state{name="OEM"} 0
ipmi_bmc_watchdog_timer_use_state{name="OS LOAD"} 0
ipmi_bmc_watchdog_timer_use_state{name="SMS/OS"} 0

The metric `ipmi_bmc_watchdog_logging_state` shows whether the watchdog logging
is enabled (1) or not (0). (Note: This is reversed in freeipmi where 0 enables
logging and 1 disables it)

The metric `ipmi_bmc_watchdog_timeout_action_state` shows whether watchdog will
take an action on timeout, and if so which one. Per freeipmi bmc-watchdog manual
there are 3 actions. If no action is configured it will be reported as `None`.

ipmi_bmc_watchdog_timeout_action_state{action="Hard Reset"} 0
ipmi_bmc_watchdog_timeout_action_state{action="None"} 0
ipmi_bmc_watchdog_timeout_action_state{action="Power Cycle"} 1
ipmi_bmc_watchdog_timeout_action_state{action="Power Down"} 0

The metric `ipmi_bmc_watchdog_pretimeout_interrupt_state` shows whether a
pre-timeout interrupt is currently active and if so, which one. Per freeipmi
bmc-watchdog manual there are 3 interrupts. If no interrupt is configured it
will be reported as `None`.

ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="Messaging Interrupt"} 0
ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="NMI / Diagnostic Interrupt"} 0
ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="None"} 1
ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="SMI"} 0

The metric `ipmi_bmc_watchdog_pretimeout_interval_seconds` shows the current
pre-timeout interval as measured in seconds.

The metric `ipmi_bmc_watchdog_initial_countdown_seconds` shows the configured
countdown in seconds.

The metric `ipmi_bmc_watchdog_current_countdown_seconds` shows the current
countdown in seconds.


## Chassis Power State

This metric is only provided if the `chassis` collector is enabled.
Expand Down
110 changes: 100 additions & 10 deletions freeipmi/freeipmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,24 @@ import (
)

var (
ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`)
ipmiDCMICurrentPowerRegex = regexp.MustCompile(`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`)
ipmiChassisPowerRegex = regexp.MustCompile(`^System Power\s*:\s(?P<value>.*)`)
ipmiChassisDriveFaultRegex = regexp.MustCompile(`^Drive Fault\s*:\s(?P<value>.*)`)
ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`)
ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`)
ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`)
bmcInfoFirmwareRevisionRegex = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoManufacturerIDRegex = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`)
ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`)
ipmiDCMICurrentPowerRegex = regexp.MustCompile(`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`)
ipmiChassisPowerRegex = regexp.MustCompile(`^System Power\s*:\s(?P<value>.*)`)
ipmiChassisDriveFaultRegex = regexp.MustCompile(`^Drive Fault\s*:\s(?P<value>.*)`)
ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`)
ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`)
ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`)
bmcInfoFirmwareRevisionRegex = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoManufacturerIDRegex = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`)
// Patterns for the BMC watchdog fields ("Timer", "Timer Use", "Logging", ...).
// Each one is anchored at the start of its input and captures the field's
// value in the named group "value"; the numeric ones require a trailing
// "seconds" unit.
bmcWatchdogTimerStateRegex = regexp.MustCompile(`^Timer:\s*(?P<value>Running|Stopped)`)
bmcWatchdogTimerUseRegex = regexp.MustCompile(`^Timer Use:\s*(?P<value>.*)`)
bmcWatchdogTimerLoggingRegex = regexp.MustCompile(`^Logging:\s*(?P<value>Enabled|Disabled)`)
bmcWatchdogTimeoutActionRegex = regexp.MustCompile(`^Timeout Action:\s*(?P<value>.*)`)
bmcWatchdogPretimeoutInterruptRegex = regexp.MustCompile(`^Pre-Timeout Interrupt:\s*(?P<value>.*)`)
bmcWatchdogPretimeoutIntervalRegex = regexp.MustCompile(`^Pre-Timeout Interval:\s*(?P<value>[0-9.]*)\s*seconds.*`)
bmcWatchdogInitialCountdownRegex = regexp.MustCompile(`^Initial Countdown:\s*(?P<value>[0-9.]*)\s*seconds.*`)
bmcWatchdogCurrentCountdownRegex = regexp.MustCompile(`^Current Countdown:\s*(?P<value>[0-9.]*)\s*seconds.*`)
)

// Result represents the outcome of a call to one of the FreeIPMI tools.
Expand Down Expand Up @@ -327,3 +335,85 @@ func GetRawOctets(ipmiOutput Result) ([]string, error) {
octets := strings.Split(strOutput[6:], " ")
return octets, nil
}

// GetBMCWatchdogTimerState reports the watchdog timer state found in
// ipmiOutput: 1 when the "Timer" field reads "Running", 0 otherwise.
//
// It returns -1 and an error when the command itself failed or the field
// could not be extracted.
func GetBMCWatchdogTimerState(ipmiOutput Result) (float64, error) {
	if ipmiOutput.err != nil {
		return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
	}
	state, err := getValue(ipmiOutput.output, bmcWatchdogTimerStateRegex)
	switch {
	case err != nil:
		return -1, err
	case state == "Running":
		return 1, nil
	default:
		return 0, nil
	}
}

// GetBMCWatchdogTimerUse returns the text of the "Timer Use" field found in
// ipmiOutput.
//
// It returns an empty string and an error when the command itself failed;
// otherwise the result of the field extraction is passed through unchanged.
func GetBMCWatchdogTimerUse(ipmiOutput Result) (string, error) {
	if cmdErr := ipmiOutput.err; cmdErr != nil {
		return "", fmt.Errorf("%s: %s", cmdErr, ipmiOutput.output)
	}
	timerUse, err := getValue(ipmiOutput.output, bmcWatchdogTimerUseRegex)
	return timerUse, err
}

// GetBMCWatchdogLoggingState reports the watchdog logging flag found in
// ipmiOutput: 1 when the "Logging" field reads "Enabled", 0 otherwise.
//
// It returns -1 and an error when the command itself failed or the field
// could not be extracted.
func GetBMCWatchdogLoggingState(ipmiOutput Result) (float64, error) {
	if ipmiOutput.err != nil {
		return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
	}
	logging, err := getValue(ipmiOutput.output, bmcWatchdogTimerLoggingRegex)
	if err != nil {
		return -1, err
	}
	var enabled float64
	if logging == "Enabled" {
		enabled = 1
	}
	return enabled, nil
}

// GetBMCWatchdogTimeoutAction returns the text of the "Timeout Action" field
// found in ipmiOutput.
//
// It returns an empty string and an error when the command itself failed;
// otherwise the result of the field extraction is passed through unchanged.
func GetBMCWatchdogTimeoutAction(ipmiOutput Result) (string, error) {
	if cmdErr := ipmiOutput.err; cmdErr != nil {
		return "", fmt.Errorf("%s: %s", cmdErr, ipmiOutput.output)
	}
	action, err := getValue(ipmiOutput.output, bmcWatchdogTimeoutActionRegex)
	return action, err
}

// GetBMCWatchdogPretimeoutInterrupt returns the text of the "Pre-Timeout
// Interrupt" field found in ipmiOutput.
//
// It returns an empty string and an error when the command itself failed;
// otherwise the result of the field extraction is passed through unchanged.
func GetBMCWatchdogPretimeoutInterrupt(ipmiOutput Result) (string, error) {
	if cmdErr := ipmiOutput.err; cmdErr != nil {
		return "", fmt.Errorf("%s: %s", cmdErr, ipmiOutput.output)
	}
	interrupt, err := getValue(ipmiOutput.output, bmcWatchdogPretimeoutInterruptRegex)
	return interrupt, err
}

// GetBMCWatchdogPretimeoutInterval returns the number in the "Pre-Timeout
// Interval" field of ipmiOutput, which the field expresses in seconds.
//
// It returns -1 and an error when the command itself failed or the field
// could not be extracted; a value that is not a valid float yields the
// error (and result) from strconv.ParseFloat.
func GetBMCWatchdogPretimeoutInterval(ipmiOutput Result) (float64, error) {
	if ipmiOutput.err != nil {
		return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
	}
	raw, err := getValue(ipmiOutput.output, bmcWatchdogPretimeoutIntervalRegex)
	if err == nil {
		return strconv.ParseFloat(raw, 64)
	}
	return -1, err
}

// GetBMCWatchdogInitialCountdown returns the number in the "Initial
// Countdown" field of ipmiOutput, which the field expresses in seconds.
//
// It returns -1 and an error when the command itself failed or the field
// could not be extracted; a value that is not a valid float yields the
// error (and result) from strconv.ParseFloat.
func GetBMCWatchdogInitialCountdown(ipmiOutput Result) (float64, error) {
	if ipmiOutput.err != nil {
		return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
	}
	raw, err := getValue(ipmiOutput.output, bmcWatchdogInitialCountdownRegex)
	if err == nil {
		return strconv.ParseFloat(raw, 64)
	}
	return -1, err
}

// GetBMCWatchdogCurrentCountdown returns the number in the "Current
// Countdown" field of ipmiOutput, which the field expresses in seconds.
//
// It returns -1 and an error when the command itself failed or the field
// could not be extracted; a value that is not a valid float yields the
// error (and result) from strconv.ParseFloat.
func GetBMCWatchdogCurrentCountdown(ipmiOutput Result) (float64, error) {
	if ipmiOutput.err != nil {
		return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
	}
	raw, err := getValue(ipmiOutput.output, bmcWatchdogCurrentCountdownRegex)
	if err == nil {
		return strconv.ParseFloat(raw, 64)
	}
	return -1, err
}
2 changes: 1 addition & 1 deletion ipmi_local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# In most cases, this should work without using a config file at all.
modules:
default:
# Available collectors are bmc, ipmi, chassis, dcmi, sel, and sm-lan-mode
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, and sm-lan-mode
collectors:
- bmc
- ipmi
Expand Down
3 changes: 2 additions & 1 deletion ipmi_remote.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ modules:
# timeout in Prometheus accordingly.
# Must be larger than the retransmission timeout, which defaults to 1000.
timeout: 10000
# Available collectors are bmc, ipmi, chassis, dcmi, sel, and sm-lan-mode
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel,
# and sm-lan-mode
# If _not_ specified, bmc, ipmi, chassis, and dcmi are used
collectors:
- bmc
Expand Down

0 comments on commit b302e65

Please sign in to comment.