From 5a4e594b689ba25651de911fef4e8f89f83aec72 Mon Sep 17 00:00:00 2001 From: Erdem Agaoglu Date: Tue, 7 Nov 2023 14:47:01 +0300 Subject: [PATCH 1/2] Add collector for bmc-watchdog Some BMC's provide a watchdog functionality, i.e. taking some specified action if a timer is not reset within a specified time. freeipmi tools have a bmc-watchdog command to control and also report the current status of such function. This collector reports that information. Signed-off-by: Erdem Agaoglu --- collector_bmc_watchdog.go | 164 ++++++++++++++++++++++++++++++++++++++ config.go | 2 + freeipmi/freeipmi.go | 110 ++++++++++++++++++++++--- 3 files changed, 266 insertions(+), 10 deletions(-) create mode 100644 collector_bmc_watchdog.go diff --git a/collector_bmc_watchdog.go b/collector_bmc_watchdog.go new file mode 100644 index 0000000..86d7f9d --- /dev/null +++ b/collector_bmc_watchdog.go @@ -0,0 +1,164 @@ +// Copyright 2021 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus" + + "github.com/prometheus-community/ipmi_exporter/freeipmi" +) + +const ( + BMCWatchdogCollectorName CollectorName = "bmc-watchdog" +) + +var ( + bmcWatchdogTimerDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "timer_state"), + "Watchdog timer running (1: running, 0: stopped)", + []string{}, + nil, + ) + watchdogTimerUses = []string{"BIOS FRB2", "BIOS POST", "OS LOAD", "SMS/OS", "OEM"} + bmcWatchdogTimerUseDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "timer_use_state"), + "Watchdog timer use (1: active, 0: inactive)", + []string{"name"}, + nil, + ) + bmcWatchdogLoggingDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "logging_state"), + "Watchdog log flag (1: Enabled, 0: Disabled / note: reverse of freeipmi)", + []string{}, + nil, + ) + watchdogTimeoutActions = []string{"None", "Hard Reset", "Power Down", "Power Cycle"} + bmcWatchdogTimeoutActionDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "timeout_action_state"), + "Watchdog timeout action (1: active, 0: inactive)", + []string{"action"}, + nil, + ) + watchdogPretimeoutInterrupts = []string{"None", "SMI", "NMI / Diagnostic Interrupt", "Messaging Interrupt"} + bmcWatchdogPretimeoutInterruptDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "pretimeout_interrupt_state"), + "Watchdog pre-timeout interrupt (1: active, 0: inactive)", + []string{"interrupt"}, + nil, + ) + bmcWatchdogPretimeoutIntervalDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "pretimeout_interval_seconds"), + "Watchdog pre-timeout interval in seconds", + []string{}, + nil, + ) + bmcWatchdogInitialCountdownDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "initial_countdown_seconds"), + "Watchdog initial countdown in seconds", + 
[]string{}, + nil, + ) + bmcWatchdogCurrentCountdownDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "bmc_watchdog", "current_countdown_seconds"), + "Watchdog current countdown in seconds", + []string{}, + nil, + ) +) + +type BMCWatchdogCollector struct{} + +func (c BMCWatchdogCollector) Name() CollectorName { + return BMCWatchdogCollectorName +} + +func (c BMCWatchdogCollector) Cmd() string { + return "bmc-watchdog" +} + +func (c BMCWatchdogCollector) Args() []string { + return []string{"--get"} +} + +func (c BMCWatchdogCollector) Collect(result freeipmi.Result, ch chan<- prometheus.Metric, target ipmiTarget) (int, error) { + timerState, err := freeipmi.GetBMCWatchdogTimerState(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog timer", "target", targetName(target.host), "error", err) + return 0, err + } + currentTimerUse, err := freeipmi.GetBMCWatchdogTimerUse(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog timer use", "target", targetName(target.host), "error", err) + return 0, err + } + loggingState, err := freeipmi.GetBMCWatchdogLoggingState(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog logging", "target", targetName(target.host), "error", err) + return 0, err + } + currentTimeoutAction, err := freeipmi.GetBMCWatchdogTimeoutAction(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog timeout action", "target", targetName(target.host), "error", err) + return 0, err + } + currentPretimeoutInterrupt, err := freeipmi.GetBMCWatchdogPretimeoutInterrupt(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog pretimeout interrupt", "target", targetName(target.host), "error", err) + return 0, err + } + pretimeoutInterval, err := freeipmi.GetBMCWatchdogPretimeoutInterval(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog 
pretimeout interval", "target", targetName(target.host), "error", err) + return 0, err + } + initialCountdown, err := freeipmi.GetBMCWatchdogInitialCountdown(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog initial countdown", "target", targetName(target.host), "error", err) + return 0, err + } + currentCountdown, err := freeipmi.GetBMCWatchdogCurrentCountdown(result) + if err != nil { + level.Error(logger).Log("msg", "Failed to collect BMC watchdog current countdown", "target", targetName(target.host), "error", err) + return 0, err + } + + ch <- prometheus.MustNewConstMetric(bmcWatchdogTimerDesc, prometheus.GaugeValue, timerState) + for _, timerUse := range watchdogTimerUses { + if currentTimerUse == timerUse { + ch <- prometheus.MustNewConstMetric(bmcWatchdogTimerUseDesc, prometheus.GaugeValue, 1, timerUse) + } else { + ch <- prometheus.MustNewConstMetric(bmcWatchdogTimerUseDesc, prometheus.GaugeValue, 0, timerUse) + } + } + ch <- prometheus.MustNewConstMetric(bmcWatchdogLoggingDesc, prometheus.GaugeValue, loggingState) + for _, timeoutAction := range watchdogTimeoutActions { + if currentTimeoutAction == timeoutAction { + ch <- prometheus.MustNewConstMetric(bmcWatchdogTimeoutActionDesc, prometheus.GaugeValue, 1, timeoutAction) + } else { + ch <- prometheus.MustNewConstMetric(bmcWatchdogTimeoutActionDesc, prometheus.GaugeValue, 0, timeoutAction) + } + } + for _, pretimeoutInterrupt := range watchdogPretimeoutInterrupts { + if currentPretimeoutInterrupt == pretimeoutInterrupt { + ch <- prometheus.MustNewConstMetric(bmcWatchdogPretimeoutInterruptDesc, prometheus.GaugeValue, 1, pretimeoutInterrupt) + } else { + ch <- prometheus.MustNewConstMetric(bmcWatchdogPretimeoutInterruptDesc, prometheus.GaugeValue, 0, pretimeoutInterrupt) + } + } + ch <- prometheus.MustNewConstMetric(bmcWatchdogPretimeoutIntervalDesc, prometheus.GaugeValue, pretimeoutInterval) + ch <- prometheus.MustNewConstMetric(bmcWatchdogInitialCountdownDesc, 
prometheus.GaugeValue, initialCountdown) + ch <- prometheus.MustNewConstMetric(bmcWatchdogCurrentCountdownDesc, prometheus.GaugeValue, currentCountdown) + return 1, nil +} diff --git a/config.go b/config.go index b307b45..494e47e 100644 --- a/config.go +++ b/config.go @@ -76,6 +76,8 @@ func (c CollectorName) GetInstance() (collector, error) { return IPMICollector{}, nil case BMCCollectorName: return BMCCollector{}, nil + case BMCWatchdogCollectorName: + return BMCWatchdogCollector{}, nil case SELCollectorName: return SELCollector{}, nil case DCMICollectorName: diff --git a/freeipmi/freeipmi.go b/freeipmi/freeipmi.go index 1591c58..0b717b8 100644 --- a/freeipmi/freeipmi.go +++ b/freeipmi/freeipmi.go @@ -33,16 +33,24 @@ import ( ) var ( - ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`) - ipmiDCMICurrentPowerRegex = regexp.MustCompile(`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`) - ipmiChassisPowerRegex = regexp.MustCompile(`^System Power\s*:\s(?P<value>.*)`) - ipmiChassisDriveFaultRegex = regexp.MustCompile(`^Drive Fault\s*:\s(?P<value>.*)`) - ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`) - ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`) - ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`) - bmcInfoFirmwareRevisionRegex = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`) - bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`) - bmcInfoManufacturerIDRegex = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`) + ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`) + ipmiDCMICurrentPowerRegex = regexp.MustCompile(`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`) + ipmiChassisPowerRegex = regexp.MustCompile(`^System Power\s*:\s(?P<value>.*)`) + ipmiChassisDriveFaultRegex = regexp.MustCompile(`^Drive
Fault\s*:\s(?P<value>.*)`) + ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`) + ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`) + ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`) + bmcInfoFirmwareRevisionRegex = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`) + bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`) + bmcInfoManufacturerIDRegex = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`) + bmcWatchdogTimerStateRegex = regexp.MustCompile(`^Timer:\s*(?P<value>Running|Stopped)`) + bmcWatchdogTimerUseRegex = regexp.MustCompile(`^Timer Use:\s*(?P<value>.*)`) + bmcWatchdogTimerLoggingRegex = regexp.MustCompile(`^Logging:\s*(?P<value>Enabled|Disabled)`) + bmcWatchdogTimeoutActionRegex = regexp.MustCompile(`^Timeout Action:\s*(?P<value>.*)`) + bmcWatchdogPretimeoutInterruptRegex = regexp.MustCompile(`^Pre-Timeout Interrupt:\s*(?P<value>.*)`) + bmcWatchdogPretimeoutIntervalRegex = regexp.MustCompile(`^Pre-Timeout Interval:\s*(?P<value>[0-9.]*)\s*seconds.*`) + bmcWatchdogInitialCountdownRegex = regexp.MustCompile(`^Initial Countdown:\s*(?P<value>[0-9.]*)\s*seconds.*`) + bmcWatchdogCurrentCountdownRegex = regexp.MustCompile(`^Current Countdown:\s*(?P<value>[0-9.]*)\s*seconds.*`) ) // Result represents the outcome of a call to one of the FreeIPMI tools. 
@@ -327,3 +335,85 @@ func GetRawOctets(ipmiOutput Result) ([]string, error) { octets := strings.Split(strOutput[6:], " ") return octets, nil } + +func GetBMCWatchdogTimerState(ipmiOutput Result) (float64, error) { + if ipmiOutput.err != nil { + return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + value, err := getValue(ipmiOutput.output, bmcWatchdogTimerStateRegex) + if err != nil { + return -1, err + } + if value == "Running" { + return 1, err + } + return 0, err +} + +func GetBMCWatchdogTimerUse(ipmiOutput Result) (string, error) { + if ipmiOutput.err != nil { + return "", fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + return getValue(ipmiOutput.output, bmcWatchdogTimerUseRegex) +} + +func GetBMCWatchdogLoggingState(ipmiOutput Result) (float64, error) { + if ipmiOutput.err != nil { + return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + value, err := getValue(ipmiOutput.output, bmcWatchdogTimerLoggingRegex) + if err != nil { + return -1, err + } + if value == "Enabled" { + return 1, err + } + return 0, err +} + +func GetBMCWatchdogTimeoutAction(ipmiOutput Result) (string, error) { + if ipmiOutput.err != nil { + return "", fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + return getValue(ipmiOutput.output, bmcWatchdogTimeoutActionRegex) +} + +func GetBMCWatchdogPretimeoutInterrupt(ipmiOutput Result) (string, error) { + if ipmiOutput.err != nil { + return "", fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + return getValue(ipmiOutput.output, bmcWatchdogPretimeoutInterruptRegex) +} + +func GetBMCWatchdogPretimeoutInterval(ipmiOutput Result) (float64, error) { + if ipmiOutput.err != nil { + return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + value, err := getValue(ipmiOutput.output, bmcWatchdogPretimeoutIntervalRegex) + if err != nil { + return -1, err + } + return strconv.ParseFloat(value, 64) +} + +func GetBMCWatchdogInitialCountdown(ipmiOutput Result) (float64, 
error) { + if ipmiOutput.err != nil { + return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + value, err := getValue(ipmiOutput.output, bmcWatchdogInitialCountdownRegex) + if err != nil { + return -1, err + } + return strconv.ParseFloat(value, 64) +} + +func GetBMCWatchdogCurrentCountdown(ipmiOutput Result) (float64, error) { + if ipmiOutput.err != nil { + return -1, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output) + } + value, err := getValue(ipmiOutput.output, bmcWatchdogCurrentCountdownRegex) + if err != nil { + return -1, err + } + return strconv.ParseFloat(value, 64) +} From 46a9390d2c49b5bf0d1feffd1d935b0a88dc6877 Mon Sep 17 00:00:00 2001 From: Erdem Agaoglu Date: Tue, 7 Nov 2023 15:11:13 +0300 Subject: [PATCH 2/2] Documentation for bmc-watchdog Signed-off-by: Erdem Agaoglu --- docs/metrics.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ ipmi_local.yml | 2 +- ipmi_remote.yml | 3 ++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index 5df88ea..4daacc5 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -13,6 +13,8 @@ These metrics provide data about the scrape itself: power consumption metrics (see below) will not be available - `bmc`: collects BMC details. If it fails, BMC info metrics (see below) will not be available + - `bmc-watchdog`: collects status of the watchdog. If it fails, BMC watchdog + metrics (see below) will not be available - `chassis`: collects the current chassis power state (on/off). If it fails, the chassis power state metric (see below) will not be available - `sel`: collects system event log (SEL) details. If it fails, SEL metrics @@ -36,6 +38,56 @@ version). Example: **Note:** some systems do not expose the system's firmware version, in which case it will be exported as `"N/A"`. +## BMC Watchdog + +These metrics are only provided if the `bmc-watchdog` collector is enabled. 
+ +The metric `ipmi_bmc_watchdog_timer_state` shows whether the watchdog timer is +currently running (1) or stopped (0). + +The metric `ipmi_bmc_watchdog_timer_use_state` shows which timer use is +currently active. Per freeipmi bmc-watchdog manual there are 5 uses. This metric +will return 1 for only one of those and 0 for the rest. + + ipmi_bmc_watchdog_timer_use_state{name="BIOS FRB2"} 1 + ipmi_bmc_watchdog_timer_use_state{name="BIOS POST"} 0 + ipmi_bmc_watchdog_timer_use_state{name="OEM"} 0 + ipmi_bmc_watchdog_timer_use_state{name="OS LOAD"} 0 + ipmi_bmc_watchdog_timer_use_state{name="SMS/OS"} 0 + +The metric `ipmi_bmc_watchdog_logging_state` shows whether the watchdog logging +is enabled (1) or not (0). (Note: This is reversed in freeipmi where 0 enables +logging and 1 disables it) + +The metric `ipmi_bmc_watchdog_timeout_action_state` shows whether watchdog will +take an action on timeout, and if so which one. Per freeipmi bmc-watchdog manual +there are 3 actions. If no action is configured it will be reported as `None`. + + ipmi_bmc_watchdog_timeout_action_state{action="Hard Reset"} 0 + ipmi_bmc_watchdog_timeout_action_state{action="None"} 0 + ipmi_bmc_watchdog_timeout_action_state{action="Power Cycle"} 1 + ipmi_bmc_watchdog_timeout_action_state{action="Power Down"} 0 + +The metric `ipmi_bmc_watchdog_pretimeout_interrupt_state` shows whether a pre-timeout +interrupt is currently active and if so, which one. Per freeipmi bmc-watchdog +manual there are 3 interrupts. If no interrupt is configured it will be reported +as `None`. + + ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="Messaging Interrupt"} 0 + ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="NMI / Diagnostic Interrupt"} 0 + ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="None"} 1 + ipmi_bmc_watchdog_pretimeout_interrupt_state{interrupt="SMI"} 0 + +The metric `ipmi_bmc_watchdog_pretimeout_interval_seconds` shows the current +pre-timeout interval as measured in seconds. 
+ +The metric `ipmi_bmc_watchdog_initial_countdown_seconds` shows the configured +countdown in seconds. + +The metric `ipmi_bmc_watchdog_current_countdown_seconds` shows the current +countdown in seconds. + + ## Chassis Power State This metric is only provided if the `chassis` collector is enabled. diff --git a/ipmi_local.yml b/ipmi_local.yml index d43de40..26404ee 100644 --- a/ipmi_local.yml +++ b/ipmi_local.yml @@ -4,7 +4,7 @@ # In most cases, this should work without using a config file at all. modules: default: - # Available collectors are bmc, ipmi, chassis, dcmi, sel, and sm-lan-mode + # Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, and sm-lan-mode collectors: - bmc - ipmi diff --git a/ipmi_remote.yml b/ipmi_remote.yml index 799445d..ce9f713 100644 --- a/ipmi_remote.yml +++ b/ipmi_remote.yml @@ -22,7 +22,8 @@ modules: # timeout in Prometheus accordingly. # Must be larger than the retransmission timeout, which defaults to 1000. timeout: 10000 - # Available collectors are bmc, ipmi, chassis, dcmi, sel, and sm-lan-mode + # Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, + # and sm-lan-mode # If _not_ specified, bmc, ipmi, chassis, and dcmi are used collectors: - bmc