From 35dbec2263c6db74346f57262b11e3459c0185a3 Mon Sep 17 00:00:00 2001 From: Diego Asturias Date: Tue, 10 Dec 2024 01:24:02 +0000 Subject: [PATCH] feat: Collect /sys/class/net PCIe AER counters Signed-off-by: Diego Asturias --- collector/aer_linux.go | 252 +++++++++++++++++++++++++++++++++++++++++ go.mod | 2 + go.sum | 4 +- 3 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 collector/aer_linux.go diff --git a/collector/aer_linux.go b/collector/aer_linux.go new file mode 100644 index 0000000000..c0855bf4ef --- /dev/null +++ b/collector/aer_linux.go @@ -0,0 +1,252 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+//go:build !noaer && linux
+// +build !noaer,linux
+
+package collector
+
+import (
+	"errors"
+	"fmt"
+	"log/slog"
+	"os"
+	"regexp"
+	"strconv"
+
+	"github.com/alecthomas/kingpin/v2"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/procfs/sysfs"
+)
+
+// correctableAerDesc builds the descriptor of one correctable AER counter.
+// The fully-qualified name is BuildFQName(namespace, "aer", name) and the
+// metric carries a single "interface" label.
+func correctableAerDesc(name, help string) *prometheus.Desc {
+	return prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "aer", name),
+		help,
+		[]string{"interface"}, nil,
+	)
+}
+
+// uncorrectableAerDesc builds the descriptor of one uncorrectable AER counter.
+// In addition to "interface" it carries a "fatal" label, which the collector
+// fills with "true" for the fatal counters and "false" for the non-fatal ones.
+func uncorrectableAerDesc(name, help string) *prometheus.Desc {
+	return prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "aer", name),
+		help,
+		[]string{"interface", "fatal"}, nil,
+	)
+}
+
+// NOTE(review): these metrics are exported as counters but their names do not
+// end in "_total" as the Prometheus naming conventions suggest. The names are
+// left unchanged here; decide before the collector is released.
+var (
+	// aerIgnoredDevices holds the regexp of device names to skip. The default
+	// "^$" only matches an empty name, so no device is ignored by default.
+	aerIgnoredDevices = kingpin.Flag("collector.aer.ignored-devices", "Regexp of aer devices to ignore for aer collector.").Default("^$").String()
+
+	// Correctable error counters.
+	aerCorrectableRxErr       = correctableAerDesc("correctable_rx_err", "Count of correctable receiver errors")
+	aerCorrectableBadTLP      = correctableAerDesc("correctable_bad_tlp", "Count of correctable bad TLPs")
+	aerCorrectableBadDLLP     = correctableAerDesc("correctable_bad_dllp", "Count of correctable bad DLLPs")
+	aerCorrectableRollover    = correctableAerDesc("correctable_rollover", "Count of correctable rollovers")
+	aerCorrectableTimeout     = correctableAerDesc("correctable_timeout", "Count of correctable replay timer timeouts")
+	aerCorrectableNonFatalErr = correctableAerDesc("correctable_non_fatal_err", "Count of correctable advisory non-fatal errors")
+	aerCorrectableCorrIntErr  = correctableAerDesc("correctable_corr_int_err", "Count of correctable corrected internal errors")
+	aerCorrectableHeaderOF    = correctableAerDesc("correctable_header_of", "Count of correctable header log Overflows")
+
+	// Uncorrectable error counters (shared by the fatal and non-fatal sets).
+	aerUncorrectableUndefined        = uncorrectableAerDesc("uncorrectable_undefined", "Count of uncorrectable undefined errors")
+	aerUncorrectableDLP              = uncorrectableAerDesc("uncorrectable_dlp", "Count of uncorrectable data link protocol errors")
+	aerUncorrectableSDES             = uncorrectableAerDesc("uncorrectable_sdes", "Count of uncorrectable surprise down errors")
+	aerUncorrectableTLP              = uncorrectableAerDesc("uncorrectable_tlp", "Count of uncorrectable poisoned TLPs")
+	aerUncorrectableFCP              = uncorrectableAerDesc("uncorrectable_fcp", "Count of uncorrectable flow control protocol errors")
+	aerUncorrectableCmpltTO          = uncorrectableAerDesc("uncorrectable_cmplt_to", "Count of uncorrectable completion timeouts")
+	aerUncorrectableCmpltAbrt        = uncorrectableAerDesc("uncorrectable_cmplt_abrt", "Count of uncorrectable completer aborts")
+	aerUncorrectableUnxCmplt         = uncorrectableAerDesc("uncorrectable_unx_cmplt", "Count of uncorrectable unexpected completion errors")
+	aerUncorrectableRxOF             = uncorrectableAerDesc("uncorrectable_rx_of", "Count of uncorrectable receiver overflows")
+	aerUncorrectableMalfTLP          = uncorrectableAerDesc("uncorrectable_malf_tlp", "Count of uncorrectable malformed TLPs")
+	aerUncorrectableECRC             = uncorrectableAerDesc("uncorrectable_ecrc", "Count of uncorrectable ECRCs")
+	aerUncorrectableUnsupReq         = uncorrectableAerDesc("uncorrectable_unsup_req", "Count of uncorrectable unsupported requests")
+	aerUncorrectableACSViol          = uncorrectableAerDesc("uncorrectable_acs_viol", "Count of uncorrectable ACS violations")
+	aerUncorrectableUncorrIntErr     = uncorrectableAerDesc("uncorrectable_uncorr_int_err", "Count of uncorrectable uncorrectable internal errors")
+	aerUncorrectableBlockedTLP       = uncorrectableAerDesc("uncorrectable_blocked_tlp", "Count of uncorrectable MC blocked TLPs")
+	aerUncorrectableAtomicOpBlocked  = uncorrectableAerDesc("uncorrectable_atomic_op_blocked", "Count of uncorrectable AtomicOp egress blocked errors")
+	aerUncorrectableTLPBlockedErr    = uncorrectableAerDesc("uncorrectable_tlp_blocked_err", "Count of uncorrectable TLP prefix blocked errors")
+	aerUncorrectablePoisonTLPBlocked = uncorrectableAerDesc("uncorrectable_poison_tlp_blocked", "Count of uncorrectable poison TLP prefix blocked errors")
+)
+
+// aerCollector exports the AER counters read through sysfs. It keeps the
+// sysfs handle, the compiled pattern of device names to skip, and the logger
+// used to report read problems.
+type aerCollector struct {
+	fs                    sysfs.FS
+	ignoredDevicesPattern *regexp.Regexp
+	logger                *slog.Logger
+}
+
+// init registers the collector under the name "aer"; it is disabled by default.
+func init() {
+	registerCollector("aer", defaultDisabled, NewAerCollector)
+}
+
+// NewAerCollector returns a new Collector exposing PCIe AER counters.
+func NewAerCollector(logger *slog.Logger) (Collector, error) {
+	c, err := makeAerCollector(logger)
+	if err != nil {
+		// Return an untyped nil so the Collector interface value is itself
+		// nil instead of wrapping a nil *aerCollector.
+		return nil, err
+	}
+	return c, nil
+}
+
+// makeAerCollector opens sysfs at *sysPath and compiles the
+// --collector.aer.ignored-devices pattern. It returns an error, rather than
+// panicking, when sysfs cannot be opened or the user-supplied regexp is
+// invalid.
+func makeAerCollector(logger *slog.Logger) (*aerCollector, error) {
+	fs, err := sysfs.NewFS(*sysPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open sysfs: %w", err)
+	}
+
+	if *aerIgnoredDevices != "" {
+		logger.Info("Parsed flag --collector.aer.ignored-devices", "flag", *aerIgnoredDevices)
+	}
+	// The pattern comes from the command line, so a malformed value must be
+	// reported as an error instead of crashing the exporter.
+	pattern, err := regexp.Compile(*aerIgnoredDevices)
+	if err != nil {
+		return nil, fmt.Errorf("invalid --collector.aer.ignored-devices regexp %q: %w", *aerIgnoredDevices, err)
+	}
+
+	return &aerCollector{
+		fs:                    fs,
+		ignoredDevicesPattern: pattern,
+		logger:                logger,
+	}, nil
+}
+
+// Update reads the AER counters through sysfs and sends, for every device
+// whose name does not match the ignore pattern, its correctable counters
+// followed by its fatal and non-fatal uncorrectable counters on ch.
+//
+// It returns ErrNoData when the counters do not exist or cannot be read for
+// lack of permission, and a wrapped error for any other failure.
+func (c *aerCollector) Update(ch chan<- prometheus.Metric) error {
+	counters, err := c.fs.AerCounters()
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) || errors.Is(err, os.ErrPermission) {
+			c.logger.Debug("Could not read AER counters", "err", err)
+			return ErrNoData
+		}
+		return fmt.Errorf("could not get AER counters: %w", err)
+	}
+
+	for name, stats := range counters {
+		if c.ignoredDevicesPattern.MatchString(name) {
+			continue
+		}
+		c.updateCorrectableCntrs(ch, name, stats.Correctable)
+		c.updateUncorrectableCntrs(ch, name, stats.Fatal, true)
+		c.updateUncorrectableCntrs(ch, name, stats.NonFatal, false)
+	}
+	return nil
+}
+
+// aerMetric pairs a descriptor with the counter value to export for it.
+type aerMetric struct {
+	desc  *prometheus.Desc
+	value float64
+}
+
+// sendAerCounters sends every entry of metrics on ch, in order, as a counter
+// with the given label values.
+func sendAerCounters(ch chan<- prometheus.Metric, metrics []aerMetric, labelValues ...string) {
+	for _, m := range metrics {
+		ch <- prometheus.MustNewConstMetric(m.desc, prometheus.CounterValue, m.value, labelValues...)
+	}
+}
+
+// updateCorrectableCntrs sends the correctable AER counters of deviceName on
+// ch, labelled with the device name.
+func (c *aerCollector) updateCorrectableCntrs(ch chan<- prometheus.Metric, deviceName string, counters sysfs.CorrectableAerCounters) {
+	sendAerCounters(ch, []aerMetric{
+		{aerCorrectableRxErr, float64(counters.RxErr)},
+		{aerCorrectableBadTLP, float64(counters.BadTLP)},
+		{aerCorrectableBadDLLP, float64(counters.BadDLLP)},
+		{aerCorrectableRollover, float64(counters.Rollover)},
+		{aerCorrectableTimeout, float64(counters.Timeout)},
+		{aerCorrectableNonFatalErr, float64(counters.NonFatalErr)},
+		{aerCorrectableCorrIntErr, float64(counters.CorrIntErr)},
+		{aerCorrectableHeaderOF, float64(counters.HeaderOF)},
+	}, deviceName)
+}
+
+// updateUncorrectableCntrs sends the uncorrectable AER counters of deviceName
+// on ch. fatal selects the value of the "fatal" label ("true" or "false") so
+// the fatal and non-fatal counter sets can share the same metric names.
+func (c *aerCollector) updateUncorrectableCntrs(ch chan<- prometheus.Metric, deviceName string, counters sysfs.UncorrectableAerCounters, fatal bool) {
+	// Format the label once instead of once per metric.
+	fatalLabel := strconv.FormatBool(fatal)
+
+	sendAerCounters(ch, []aerMetric{
+		{aerUncorrectableUndefined, float64(counters.Undefined)},
+		{aerUncorrectableDLP, float64(counters.DLP)},
+		{aerUncorrectableSDES, float64(counters.SDES)},
+		{aerUncorrectableTLP, float64(counters.TLP)},
+		{aerUncorrectableFCP, float64(counters.FCP)},
+		{aerUncorrectableCmpltTO, float64(counters.CmpltTO)},
+		{aerUncorrectableCmpltAbrt, float64(counters.CmpltAbrt)},
+		{aerUncorrectableUnxCmplt, float64(counters.UnxCmplt)},
+		{aerUncorrectableRxOF, float64(counters.RxOF)},
+		{aerUncorrectableMalfTLP, float64(counters.MalfTLP)},
+		{aerUncorrectableECRC, float64(counters.ECRC)},
+		{aerUncorrectableUnsupReq, float64(counters.UnsupReq)},
+		{aerUncorrectableACSViol, float64(counters.ACSViol)},
+		{aerUncorrectableUncorrIntErr, float64(counters.UncorrIntErr)},
+		{aerUncorrectableBlockedTLP, float64(counters.BlockedTLP)},
+		{aerUncorrectableAtomicOpBlocked, float64(counters.AtomicOpBlocked)},
+		{aerUncorrectableTLPBlockedErr, float64(counters.TLPBlockedErr)},
+		{aerUncorrectablePoisonTLPBlocked, float64(counters.PoisonTLPBlocked)},
+	}, deviceName, fatalLabel)
+}
diff --git a/go.mod b/go.mod index e316246428..8edd201b3a 100644 --- a/go.mod +++ b/go.mod @@ -59,3 +59,5 @@ require ( google.golang.org/protobuf v1.34.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) + +replace github.com/prometheus/procfs => github.com/dasturiasArista/procfs v1.0.2 diff --git a/go.sum b/go.sum index 9230727412..1bc061e2c7 100644 --- a/go.sum +++ b/go.sum @@ -12,6 +12,8 @@ github.com/cilium/ebpf v0.12.3 h1:8ht6F9MquybnY97at+VDZb3eQQr8ev79RueWeVaEcG4= github.com/cilium/ebpf v0.12.3/go.mod 
h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/dasturiasArista/procfs v1.0.2 h1:vNFucKeBondLfu8afp7KAar4dkAO0GuoFEuuyt96ZZM= +github.com/dasturiasArista/procfs v1.0.2/go.mod h1:S2aFqsiJkGSmNV1vWOVVkGk3LOZXC6lHs6Gjq9u9v5g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -83,8 +85,6 @@ github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPA github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw= github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04= github.com/prometheus/exporter-toolkit v0.13.1/go.mod h1:ujdv2YIOxtdFxxqtloLpbqmxd5J0Le6IITUvIRSWjj0= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/safchain/ethtool v0.4.1 h1:S6mEleTADqgynileXoiapt/nKnatyR6bmIHoF+h2ADo=