Skip to content

Commit

Permalink
Merge pull request #179 from alexandrovas/feat/sel-events
Browse files Browse the repository at this point in the history
feat: custom SEL events metrics
  • Loading branch information
bitfehler authored Jan 23, 2024
2 parents 3853e45 + 45ff1b4 commit d4398a6
Show file tree
Hide file tree
Showing 6 changed files with 325 additions and 94 deletions.
141 changes: 141 additions & 0 deletions collector_sel_events.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"time"

"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"

"github.com/prometheus-community/ipmi_exporter/freeipmi"
)

const (
// SELEventsCollectorName is the name under which the SEL events collector
// is referenced, e.g. in the `collectors` list of a module configuration.
SELEventsCollectorName CollectorName = "sel-events"
)

// Metric descriptors for the SEL events collector. All three are emitted as
// gauges by SELEventsCollector.Collect.
var (
// selEventsCountByStateDesc: number of SEL entries, labelled by event state.
selEventsCountByStateDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "count_by_state"),
"Current number of log entries in the SEL by state.",
[]string{"state"},
nil,
)
// selEventsCountByNameDesc: number of SEL entries matching a configured
// custom event, labelled by the configured event name.
selEventsCountByNameDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "count_by_name"),
"Current number of custom log entries in the SEL by name.",
[]string{"name"},
nil,
)
// selEventsLatestTimestampDesc: Unix timestamp of the most recent SEL entry
// matching a configured custom event, labelled by the configured event name.
selEventsLatestTimestampDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "sel_events", "latest_timestamp"),
"Latest timestamp of custom log entries in the SEL by name.",
[]string{"name"},
nil,
)
)

// SELEventsCollector turns ipmi-sel output into per-state and per-configured-event
// metrics about system event log (SEL) entries. It carries no state.
type SELEventsCollector struct{}

// Name returns the collector's name, SELEventsCollectorName.
func (c SELEventsCollector) Name() CollectorName {
return SELEventsCollectorName
}

// Cmd returns the name of the command associated with this collector, "ipmi-sel".
func (c SELEventsCollector) Cmd() string {
return "ipmi-sel"
}

// Args returns the command-line arguments associated with this collector.
//
// NOTE(review): the output format selected here (comma-separated, with an
// event state column) presumably has to stay in sync with the line pattern
// that freeipmi.GetSELEvents parses — confirm before changing any flag.
func (c SELEventsCollector) Args() []string {
return []string{
"-Q",
"--comma-separated-output",
"--no-header-output",
"--sdr-cache-recreate",
"--output-event-state",
"--interpret-oem-data",
"--entity-sensor-names",
}
}

// Collect parses the SEL events contained in result and sends the resulting
// metrics to ch.
//
// For every event state seen in the log it emits one selEventsCountByStateDesc
// sample. For every entry of the target's SELEvents configuration it emits one
// selEventsCountByNameDesc and one selEventsLatestTimestampDesc sample; both
// are 0 when no log entry matched the entry's regular expression.
//
// It returns 1 and a nil error on success, or 0 and the error when the events
// could not be obtained from result.
func (c SELEventsCollector) Collect(result freeipmi.Result, ch chan<- prometheus.Metric, target ipmiTarget) (int, error) {
	selEventConfigs := target.config.SELEvents

	events, err := freeipmi.GetSELEvents(result)
	if err != nil {
		level.Error(logger).Log("msg", "Failed to collect SEL events", "target", targetName(target.host), "error", err)
		return 0, err
	}

	selEventByStateCount := map[string]float64{}
	selEventByNameCount := make(map[string]float64, len(selEventConfigs))
	selEventByNameTimestamp := make(map[string]float64, len(selEventConfigs))

	// Seed every configured name with zero so that its series are exported
	// even when no log entry matches.
	for _, metricConfig := range selEventConfigs {
		selEventByNameTimestamp[metricConfig.Name] = 0
		selEventByNameCount[metricConfig.Name] = 0
	}

	for _, data := range events {
		// The per-state count does not depend on the timestamp, so it is
		// recorded for every entry.
		selEventByStateCount[data.State]++

		for _, metricConfig := range selEventConfigs {
			if !metricConfig.Regex.MatchString(data.Event) {
				continue
			}
			selEventByNameCount[metricConfig.Name]++

			// A single entry with an unparseable date/time must not discard
			// the metrics of every other entry: count it, but leave the
			// latest timestamp untouched.
			// NOTE(review): FreeIPMI reportedly prints placeholders such as
			// "PostInit" in these columns for some records — confirm against
			// the ipmi-sel documentation.
			t, err := time.Parse("Jan-02-2006 15:04:05", data.Date+" "+data.Time)
			if err != nil {
				level.Debug(logger).Log("msg", "Failed to parse SEL event time", "target", targetName(target.host), "error", err)
				continue
			}
			if newTimestamp := float64(t.Unix()); newTimestamp > selEventByNameTimestamp[metricConfig.Name] {
				selEventByNameTimestamp[metricConfig.Name] = newTimestamp
			}
		}
	}

	for state, value := range selEventByStateCount {
		ch <- prometheus.MustNewConstMetric(
			selEventsCountByStateDesc,
			prometheus.GaugeValue,
			value,
			state,
		)
	}

	for name, value := range selEventByNameCount {
		ch <- prometheus.MustNewConstMetric(
			selEventsCountByNameDesc,
			prometheus.GaugeValue,
			value,
			name,
		)
		ch <- prometheus.MustNewConstMetric(
			selEventsLatestTimestampDesc,
			prometheus.GaugeValue,
			selEventByNameTimestamp[name],
			name,
		)
	}
	return 1, nil
}
13 changes: 13 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package main
import (
"fmt"
"os"
"regexp"
"strings"
"sync"

Expand Down Expand Up @@ -80,6 +81,8 @@ func (c CollectorName) GetInstance() (collector, error) {
return BMCWatchdogCollector{}, nil
case SELCollectorName:
return SELCollector{}, nil
case SELEventsCollectorName:
return SELEventsCollector{}, nil
case DCMICollectorName:
return DCMICollector{}, nil
case ChassisCollectorName:
Expand Down Expand Up @@ -124,10 +127,17 @@ type IPMIConfig struct {
CollectorArgs map[CollectorName][]string `yaml:"default_args"`
CustomArgs map[CollectorName][]string `yaml:"custom_args"`

SELEvents []*IpmiSELEvent `yaml:"sel_events,omitempty"`
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}

// IpmiSELEvent describes one user-defined SEL event from the `sel_events`
// list of the configuration.
type IpmiSELEvent struct {
// Name identifies the event; the SEL events collector uses it as the value
// of the "name" label.
Name string `yaml:"name"`
// RegexRaw is the regular expression source as written in the config file.
RegexRaw string `yaml:"regex"`
// Regex is the compiled form of RegexRaw. It is not read from YAML; it is
// filled in by IPMIConfig.UnmarshalYAML.
Regex *regexp.Regexp `yaml:"-"`
}

// defaultConfig is the default IPMIConfig value. It names the ipmi, dcmi, bmc
// and chassis collectors; the SEL collectors are not part of it.
var defaultConfig = IPMIConfig{
Collectors: []CollectorName{IPMICollectorName, DCMICollectorName, BMCCollectorName, ChassisCollectorName},
}
Expand Down Expand Up @@ -170,6 +180,9 @@ func (s *IPMIConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
return err
}
}
for _, selEvent := range s.SELEvents {
selEvent.Regex = regexp.MustCompile(selEvent.RegexRaw)
}
return nil
}

Expand Down
20 changes: 19 additions & 1 deletion docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ These metrics provide data about the scrape itself:
the chassis power state metric (see below) will not be available
- `sel`: collects system event log (SEL) details. If it fails, SEL metrics
(see below) will not be available
- `sel-events`: collects metrics for user-defined events in the system event
(SEL) log. If it fails, SEL entries metrics (see below) will not be available
- `sm-lan-mode`: collects the "LAN mode" setting in the current BMC config.
If it fails, the LAN mode metric (see below) will not be available
- `ipmi_scrape_duration_seconds` is the amount of time it took to retrieve the
Expand Down Expand Up @@ -87,7 +89,6 @@ countdown in seconds.
The metric `ipmi_bmc_watchdog_current_countdown_seconds` shows the current
countdown in seconds.


## Chassis Power State

This metric is only provided if the `chassis` collector is enabled.
Expand Down Expand Up @@ -116,6 +117,23 @@ no labels.
The metric `ipmi_sel_free_space_bytes` contains the amount of free space
currently available for new SEL entries, in bytes. This metric has no labels.

## System event log (SEL) entries metrics

These metrics are only provided if the `sel-events` collector is enabled (it
isn't by default).

For each event specified in the configuration file (`sel_events` field), metrics
are generated containing the number of matching log entries and the timestamp of
their most recent occurrence. Example:

ipmi_sel_events_count_by_name{name="my_custom_event_from_config"} 77
ipmi_sel_events_latest_timestamp{name="my_custom_event_from_config"} 1.703613275e+09

In addition, the following aggregated metrics are exported:

ipmi_sel_events_count_by_state{state="Nominal"} 10
ipmi_sel_events_count_by_state{state="Warning"} 5

## Supermicro LAN mode setting

This metric is only provided if the `sm-lan-mode` collector is enabled (it
Expand Down
54 changes: 54 additions & 0 deletions freeipmi/freeipmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
package freeipmi

import (
"bufio"
"bytes"
"crypto/rand"
"encoding/csv"
Expand All @@ -40,6 +41,7 @@ var (
ipmiChassisCoolingFaultRegex = regexp.MustCompile(`^Cooling/fan fault\s*:\s(?P<value>.*)`)
ipmiSELEntriesRegex = regexp.MustCompile(`^Number of log entries\s*:\s(?P<value>[0-9.]*)`)
ipmiSELFreeSpaceRegex = regexp.MustCompile(`^Free space remaining\s*:\s(?P<value>[0-9.]*)\s*bytes.*`)
ipmiSELEventRegex = regexp.MustCompile(`^(?P<id>[0-9]+),\s*(?P<date>[^,]*),(?P<time>[^,]*),(?P<name>[^,]*),(?P<type>[^,]*),(?P<state>[^,]*),(?P<event>[^,]*)$`)
bmcInfoFirmwareRevisionRegex = regexp.MustCompile(`^Firmware Revision\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoSystemFirmwareVersionRegex = regexp.MustCompile(`^System Firmware Version\s*:\s*(?P<value>[0-9.]*).*`)
bmcInfoManufacturerIDRegex = regexp.MustCompile(`^Manufacturer ID\s*:\s*(?P<value>.*)`)
Expand Down Expand Up @@ -71,6 +73,17 @@ type SensorData struct {
Event string
}

// SELEventData represents a single log line from the SEL, split into the
// columns captured by ipmiSELEventRegex.
type SELEventData struct {
// ID is the leading numeric column of the line.
ID int64
// Date and Time are kept as the raw column text; they are not parsed here.
Date string
Time string
// Name, Type, State and Event are the remaining columns, in that order.
Name string
Type string
State string
Event string
}

// EscapePassword escapes a password so that the result is suitable for usage in a
// FreeIPMI config file.
func EscapePassword(password string) string {
Expand Down Expand Up @@ -417,3 +430,44 @@ func GetBMCWatchdogCurrentCountdown(ipmiOutput Result) (float64, error) {
}
return strconv.ParseFloat(value, 64)
}

// GetSELEvents parses the output held in ipmiOutput into SEL event records,
// one per line that matches ipmiSELEventRegex.
//
// Lines that do not match the pattern, or whose ID column cannot be parsed as
// an int64, are skipped. An error is returned when ipmiOutput itself carries
// an error, or when the output could not be scanned to the end (for example
// because a line exceeds the scanner's buffer); in that case no events are
// returned rather than a silently truncated list.
func GetSELEvents(ipmiOutput Result) ([]SELEventData, error) {
	if ipmiOutput.err != nil {
		return nil, fmt.Errorf("%s: %s", ipmiOutput.err, ipmiOutput.output)
	}

	scanner := bufio.NewScanner(bytes.NewReader(ipmiOutput.output))
	events := []SELEventData{}
	for scanner.Scan() {
		match := ipmiSELEventRegex.FindStringSubmatch(scanner.Text())
		// Ignore lines that do not match the event pattern.
		if match == nil {
			continue
		}

		// Index the captured columns by their group name.
		fields := make(map[string]string, len(match))
		for i, name := range ipmiSELEventRegex.SubexpNames() {
			if i != 0 && name != "" {
				fields[name] = match[i]
			}
		}

		// Ignore lines whose ID is not a valid 64-bit integer.
		id, err := strconv.ParseInt(fields["id"], 10, 64)
		if err != nil {
			continue
		}

		events = append(events, SELEventData{
			ID:    id,
			Date:  fields["date"],
			Time:  fields["time"],
			Name:  fields["name"],
			Type:  fields["type"],
			State: fields["state"],
			Event: fields["event"],
		})
	}
	// Scan returns false on both EOF and failure; only Err tells them apart.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading SEL events: %w", err)
	}
	return events, nil
}
31 changes: 18 additions & 13 deletions ipmi_local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,21 @@
# This is an example config for scraping the local host.
# In most cases, this should work without using a config file at all.
modules:
default:
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, and sm-lan-mode
collectors:
- bmc
- ipmi
- dcmi
- chassis
- sel
# Got any sensors you don't care about? Add them here.
exclude_sensor_ids:
- 2
- 29
- 32
default:
# Available collectors are bmc, bmc-watchdog, ipmi, chassis, dcmi, sel, sel-events and sm-lan-mode
collectors:
- bmc
- ipmi
- dcmi
- chassis
- sel
- sel-events
# Got any sensors you don't care about? Add them here.
exclude_sensor_ids:
- 2
- 29
- 32
# Define custom metrics for SEL entries
sel_events:
- name: correctable_memory_error
regex: Correctable memory error.*
Loading

0 comments on commit d4398a6

Please sign in to comment.