Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RDMA collector #3176

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions collector/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ func readUintFromFile(path string) (uint64, error) {
return value, nil
}

// readStringFromFile returns the contents of the file at path with leading
// and trailing whitespace removed. If the file cannot be read, it returns the
// empty string; the read error is intentionally not reported.
func readStringFromFile(path string) string {
	if raw, err := os.ReadFile(path); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}

var metricNameRegex = regexp.MustCompile(`_*[^0-9A-Za-z_]+_*`)

// SanitizeMetricName sanitizes the given metric name by replacing invalid characters with underscores.
Expand Down
266 changes: 266 additions & 0 deletions collector/rdma_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !nordma
// +build !nordma

// The hard work of collecting data from the kernel via the MLNX_OFED interfaces is done by
// https://github.com/Mellanox/rdmamap
// by Mellanox. Used under the Apache 2.0 license.

package collector

import (
"fmt"
"log/slog"
"os"
"path/filepath"
"regexp"
"strings"
"sync"

"github.com/Mellanox/rdmamap"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
)

var (
// Command-line flags selecting which RDMA devices and which statistics are exported.
// The device include/exclude patterns are passed to newDeviceFilter and the metrics
// pattern is compiled into a regexp when the collector is constructed.
rdmaDeviceInclude = kingpin.Flag("collector.rdma.device-include", "Regexp of rdma devices to include (mutually exclusive to device-exclude).").String()
rdmaDeviceExclude = kingpin.Flag("collector.rdma.device-exclude", "Regexp of rdma devices to exclude (mutually exclusive to device-include).").String()
rdmaIncludedMetrics = kingpin.Flag("collector.rdma.metrics-include", "Regexp of rdma stats to include.").Default(".*").String()

// rdmaHwCounters maps a hardware counter name to the help text of its metric.
// The collector constructor prefixes each of these names with "hw_" when it
// builds the metric name.
rdmaHwCounters = map[string]string{
"roce_slow_restart_cnps": "RDMA RoCE slow restart CNPS",
yeahdongcn marked this conversation as resolved.
Show resolved Hide resolved
"rp_cnp_ignored": "RDMA RP CNP ignored",
"roce_adp_retrans_to": "RDMA RoCE adaptive retransmission timeout",
"rx_icrc_encapsulated": "RDMA RX ICRC encapsulated",
"resp_local_length_error": "RDMA response local length error",
"np_ecn_marked_roce_packets": "RDMA NP ECN marked RoCE packets",
"roce_slow_restart_trans": "RDMA RoCE slow restart transactions",
"req_remote_invalid_request": "RDMA request remote invalid request",
"local_ack_timeout_err": "RDMA local ACK timeout error",
"lifespan": "RDMA lifespan",
"req_cqe_error": "RDMA request CQE error",
"rnr_nak_retry_err": "RDMA RNR NAK retry error",
"np_cnp_sent": "RDMA NP CNP sent",
"rx_dct_connect": "RDMA RX DCT connect",
"rp_cnp_handled": "RDMA RP CNP handled",
"implied_nak_seq_err": "RDMA implied NAK sequence error",
"roce_slow_restart": "RDMA RoCE slow restart",
"req_cqe_flush_error": "RDMA request CQE flush error",
"packet_seq_err": "RDMA packet sequence error",
"duplicate_request": "RDMA duplicate request",
"roce_adp_retrans": "RDMA RoCE adaptive retransmission",
"out_of_buffer": "RDMA out of buffer",
"resp_cqe_error": "RDMA response CQE error",
"resp_cqe_flush_error": "RDMA response CQE flush error",
"out_of_sequence": "RDMA out of sequence",
"rx_read_requests": "RDMA RX read requests",
"rx_atomic_requests": "RDMA RX atomic requests",
"req_remote_access_errors": "RDMA request remote access errors",
"rx_write_requests": "RDMA RX write requests",
"resp_remote_access_errors": "RDMA response remote access errors",
"req_transport_retries_exceeded": "RDMA request transport retries exceeded",
"req_rnr_retries_exceeded": "RDMA request RNR retries exceeded",
}
// rdmaCounters maps a port counter name to the help text of its metric.
// Unlike rdmaHwCounters, these names are used for the metric name without
// an extra prefix.
rdmaCounters = map[string]string{
"unicast_rcv_packets": "RDMA unicast received packets",
"port_xmit_data": "RDMA port transmit data",
"port_xmit_constraint_errors": "RDMA port transmit constraint errors",
"VL15_dropped": "RDMA VL15 dropped",
"port_rcv_errors": "RDMA port receive errors",
"port_xmit_wait": "RDMA port transmit wait",
"link_error_recovery": "RDMA link error recovery",
"multicast_rcv_packets": "RDMA multicast received packets",
"multicast_xmit_packets": "RDMA multicast transmitted packets",
"port_rcv_remote_physical_errors": "RDMA port receive remote physical errors",
"port_rcv_packets": "RDMA port receive packets",
"unicast_xmit_packets": "RDMA unicast transmitted packets",
"excessive_buffer_overrun_errors": "RDMA excessive buffer overrun errors",
"port_rcv_data": "RDMA port receive data",
"port_rcv_constraint_errors": "RDMA port receive constraint errors",
"link_downed": "RDMA link downed",
"local_link_integrity_errors": "RDMA local link integrity errors",
"port_xmit_discards": "RDMA port transmit discards",
"port_rcv_switch_relay_errors": "RDMA port receive switch relay errors",
"port_xmit_packets": "RDMA port transmit packets",
"symbol_error": "RDMA symbol error",
}
)

// rdmaCollector exports per-port RDMA counters and a per-device info metric.
type rdmaCollector struct {
// entries maps a counter name to the descriptor of the metric it is exported as.
entries map[string]*prometheus.Desc
// entriesMutex is held while entries is read.
entriesMutex sync.Mutex
// deviceFilter decides which RDMA devices are skipped during collection.
deviceFilter deviceFilter
// infoDesc describes the constant-value info metric emitted for each device.
infoDesc *prometheus.Desc
// metricsPattern is matched against counter names; non-matching counters are not exported.
metricsPattern *regexp.Regexp
logger *slog.Logger
}

// rdmaPathsOnce guards the one-time rewrite of rdmamap's package-level paths
// so that constructing the collector more than once does not prefix the
// already-rewritten paths a second time.
var rdmaPathsOnce sync.Once

// makeRdmaCollector is the internal constructor for rdmaCollector.
//
// It logs the rdma flags that are set, compiles the metrics-include pattern,
// points rdmamap's package-level directories at the configured sysfs and
// rootfs locations, and builds one descriptor per known counter.
//
// It returns an error when --collector.rdma.metrics-include is not a valid
// regular expression.
func makeRdmaCollector(logger *slog.Logger) (*rdmaCollector, error) {
	if *rdmaDeviceInclude != "" {
		logger.Info("Parsed flag --collector.rdma.device-include", "flag", *rdmaDeviceInclude)
	}
	if *rdmaDeviceExclude != "" {
		logger.Info("Parsed flag --collector.rdma.device-exclude", "flag", *rdmaDeviceExclude)
	}
	if *rdmaIncludedMetrics != "" {
		logger.Info("Parsed flag --collector.rdma.metrics-include", "flag", *rdmaIncludedMetrics)
	}

	// The pattern comes from the command line, so report a bad value as an
	// error instead of panicking. This is done before any global state is
	// touched so a failure has no side effects.
	metricsPattern, err := regexp.Compile(*rdmaIncludedMetrics)
	if err != nil {
		return nil, fmt.Errorf("parsing --collector.rdma.metrics-include %q: %w", *rdmaIncludedMetrics, err)
	}

	// Update paths to respect the mount points setup. The rewrite is applied
	// to package-level variables of rdmamap, so it must run only once.
	rdmaPathsOnce.Do(func() {
		for _, dir := range []*string{
			&rdmamap.RdmaClassDir,
			&rdmamap.RdmaIbUcmDir,
			&rdmamap.RdmaUmadDir,
			&rdmamap.RdmaUverbsDir,
			&rdmamap.PciDevDir,
			&rdmamap.AuxDevDir,
		} {
			*dir = sysFilePath(strings.TrimPrefix(*dir, "/sys"))
		}
		for _, dir := range []*string{
			&rdmamap.RdmaUcmDevice,
			&rdmamap.RdmaDeviceDir,
		} {
			*dir = rootfsFilePath(*dir)
		}
	})

	// Pre-populate the descriptors of the known rdma metrics. Hardware
	// counters get an "hw_" prefix in their metric name; both kinds are keyed
	// by their raw counter name.
	labels := []string{"device", "port", "interfaces"}
	entries := make(map[string]*prometheus.Desc, len(rdmaHwCounters)+len(rdmaCounters))
	for metric, help := range rdmaHwCounters {
		entries[metric] = prometheus.NewDesc(buildRdmaFQName("hw_"+metric), help, labels, nil)
	}
	for metric, help := range rdmaCounters {
		entries[metric] = prometheus.NewDesc(buildRdmaFQName(metric), help, labels, nil)
	}

	return &rdmaCollector{
		deviceFilter:   newDeviceFilter(*rdmaDeviceExclude, *rdmaDeviceInclude),
		metricsPattern: metricsPattern,
		logger:         logger,
		entries:        entries,
		infoDesc: prometheus.NewDesc(
			buildRdmaFQName("info"),
			"A metric with a constant '1' value labeled by device, vendor_id, device_id, firmware_version, driver_version.",
			[]string{"device", "vendor_id", "device_id", "firmware_version", "driver_version"}, nil,
		),
	}, nil
}

// init registers NewRdmaCollector under the name "rdma", passing
// defaultDisabled as its default state.
func init() {
registerCollector("rdma", defaultDisabled, NewRdmaCollector)
}

// buildRdmaFQName returns the fully-qualified name for an rdma metric: the
// given name is sanitized, lower-cased and stripped of leading underscores,
// then placed in the "rdma" subsystem of the exporter namespace.
func buildRdmaFQName(metric string) string {
	sanitized := SanitizeMetricName(metric)
	lowered := strings.ToLower(sanitized)
	return prometheus.BuildFQName(namespace, "rdma", strings.TrimLeft(lowered, "_"))
}

// NewRdmaCollector returns a new Collector exposing rdma stats.
//
// The constructor's error is checked explicitly so that a failure yields a
// literal nil Collector rather than a non-nil interface value wrapping a nil
// *rdmaCollector.
func NewRdmaCollector(logger *slog.Logger) (Collector, error) {
	c, err := makeRdmaCollector(logger)
	if err != nil {
		return nil, err
	}
	return c, nil
}

// getNetworkInterfaces returns the names of the entries found in the
// "device/net" directory of the given RDMA device, joined with commas.
//
// The names are returned in sorted order (os.ReadDir sorts by filename) so
// that the resulting label value is stable from one scrape to the next. If
// the directory cannot be read, the empty string is returned.
func getNetworkInterfaces(rdmaDeviceName string) string {
	netDir := filepath.Join(rdmamap.RdmaClassDir, rdmaDeviceName, "device", "net")
	dirEntries, err := os.ReadDir(netDir)
	if err != nil {
		return ""
	}

	// Directory listings never contain "." or "..", so no filtering is needed.
	names := make([]string, 0, len(dirEntries))
	for _, e := range dirEntries {
		names = append(names, e.Name())
	}
	return strings.Join(names, ",")
}

// Update sends the RDMA metrics of every non-ignored device to ch.
//
// For each device it emits one gauge per known, pattern-matching port counter
// (labelled with device, port and interfaces) followed by an info metric
// carrying vendor, device, firmware and driver identifiers. A device whose
// stats cannot be read is logged and skipped entirely, including its info
// metric.
//
// It returns an error when no RDMA devices are found.
func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error {
	rdmaDevices := rdmamap.GetRdmaDeviceList()
	if len(rdmaDevices) == 0 {
		return fmt.Errorf("no rdma devices found")
	}

	for _, device := range rdmaDevices {
		if c.deviceFilter.ignored(device) {
			continue
		}

		interfaces := getNetworkInterfaces(device)

		stats, err := rdmamap.GetRdmaSysfsAllPortsStats(device)
		if err != nil {
			c.logger.Error("rdma stats error", "err", err, "device", device)
			continue
		}

		// emit sends a single counter, dropping it when it is filtered out by
		// the metrics pattern or has no pre-built descriptor.
		emit := func(name string, value float64, labelValues ...string) {
			if !c.metricsPattern.MatchString(name) {
				return
			}
			desc := c.entry(name)
			if desc == nil {
				c.logger.Warn("rdma metric not found", "name", name)
				return
			}
			ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue,
				value, labelValues...)
		}

		for _, portstats := range stats.PortStats {
			// The port label is the same for every counter of this port.
			port := fmt.Sprintf("%d", portstats.Port)
			for _, stat := range portstats.HwStats {
				emit(stat.Name, float64(stat.Value), device, port, interfaces)
			}
			for _, stat := range portstats.Stats {
				emit(stat.Name, float64(stat.Value), device, port, interfaces)
			}
		}

		deviceDir := filepath.Join(rdmamap.RdmaClassDir, device)
		vendorID := readStringFromFile(filepath.Join(deviceDir, "device", "vendor"))
		deviceID := readStringFromFile(filepath.Join(deviceDir, "device", "device"))
		// Read the firmware version of the device being reported, not of a
		// fixed "mlx5_0" device.
		firmwareVersion := readStringFromFile(filepath.Join(deviceDir, "fw_ver"))
		// NOTE(review): the driver version is read from the mlx5_core module
		// regardless of which driver backs the device — confirm whether other
		// drivers need to be supported.
		driverVersion := readStringFromFile(sysFilePath("module/mlx5_core/version"))
		ch <- prometheus.MustNewConstMetric(c.infoDesc, prometheus.GaugeValue, 1.0,
			device, vendorID, deviceID, firmwareVersion, driverVersion)
	}

	return nil
}

// entry looks up the descriptor registered for key while holding
// entriesMutex. It returns nil when no descriptor exists for key.
func (c *rdmaCollector) entry(key string) *prometheus.Desc {
	c.entriesMutex.Lock()
	desc := c.entries[key]
	c.entriesMutex.Unlock()
	return desc
}
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/prometheus/node_exporter
go 1.22.0

require (
github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57
github.com/alecthomas/kingpin/v2 v2.4.0
github.com/beevik/ntp v1.4.3
github.com/coreos/go-systemd/v22 v22.5.0
Expand Down Expand Up @@ -48,6 +49,8 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 // indirect
github.com/vishvananda/netlink v1.1.0 // indirect
github.com/vishvananda/netns v0.0.4 // indirect
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57 h1:ffMnYJFt7Bgp/2s2fOsQ0LpKfuU4xCk4afAtQG1wuBM=
github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57/go.mod h1:D3ffy5KqtmeWfuW0cX/GQW0J6S3k8aORk4bf9CBOhng=
github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY=
github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE=
github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc=
Expand Down Expand Up @@ -96,6 +98,11 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0=
github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8=
github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM=
github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc=
github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
Expand All @@ -112,6 +119,7 @@ golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE=
golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20211031064116-611d5d643895/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
Expand Down
Loading