Skip to content

Commit

Permalink
feat: expose MD raid component devices
Browse files Browse the repository at this point in the history
Expose which component devices are part of an MD RAID device, as well as the most
common per-component flags. This will enable a future node_exporter metric
showing which component of a RAID has failed.

Signed-off-by: Robin H. Johnson <[email protected]>
Signed-off-by: Robin H. Johnson <[email protected]>
  • Loading branch information
robbat2 committed Oct 3, 2024
1 parent cd2a69e commit dca5b72
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 41 deletions.
96 changes: 74 additions & 22 deletions mdstat.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,28 @@ var (
recoveryLinePctRE = regexp.MustCompile(`= (.+)%`)
recoveryLineFinishRE = regexp.MustCompile(`finish=(.+)min`)
recoveryLineSpeedRE = regexp.MustCompile(`speed=(.+)[A-Z]`)
componentDeviceRE = regexp.MustCompile(`(.*)\[\d+\]`)
componentDeviceRE = regexp.MustCompile(`(.*)\[(\d+)\](\([SF]+\))?`)
personalitiesPrefix = "Personalities : "
)

// MDStatComponent describes one component device of an MD array, as listed
// on the array's device line in /proc/mdstat (e.g. "sda1[0](F)").
type MDStatComponent struct {
// Name of the component device.
Name string
// DescriptorIndex number of component device, e.g. the order in the superblock.
// It is the number found between the square brackets after the device name.
DescriptorIndex int32
// Flags per Linux drivers/md/md.[ch] as of v6.12-rc1
// Subset that are exposed in mdstat
// WriteMostly corresponds to the "(W)" marker.
WriteMostly bool
// Journal corresponds to the "(J)" marker.
Journal bool
Faulty bool // "Faulty" is what kernel source uses for "(F)"
// Spare corresponds to the "(S)" marker.
Spare bool
// Replacement corresponds to the "(R)" marker.
Replacement bool
// Some additional flags that are NOT exposed in procfs today; they may
// be available via sysfs.
// In_sync, Bitmap_sync, Blocked, WriteErrorSeen, FaultRecorded,
// BlockedBadBlocks, WantReplacement, Candidate, ...
}

// MDStat holds info parsed from /proc/mdstat.
type MDStat struct {
// Name of the device.
Expand Down Expand Up @@ -60,8 +79,8 @@ type MDStat struct {
BlocksSyncedFinishTime float64
// current sync speed (in Kilobytes/sec)
BlocksSyncedSpeed float64
// Name of md component devices
Devices []string
// component devices
Devices []MDStatComponent
}

// MDStat parses an mdstat-file (/proc/mdstat) and returns a slice of
Expand All @@ -82,38 +101,52 @@ func (fs FS) MDStat() ([]MDStat, error) {
// parseMDStat parses data from mdstat file (/proc/mdstat) and returns a slice of
// structs containing the relevant info.
func parseMDStat(mdStatData []byte) ([]MDStat, error) {
// TODO:
// - parse global hotspares from the "unused devices" line.
mdStats := []MDStat{}
lines := strings.Split(string(mdStatData), "\n")
knownRaidTypes := make(map[string]bool)

for i, line := range lines {
if strings.TrimSpace(line) == "" || line[0] == ' ' ||
strings.HasPrefix(line, "Personalities") ||
strings.HasPrefix(line, "unused") {
continue
}
// Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10]
if len(knownRaidTypes) == 0 && strings.HasPrefix(line, personalitiesPrefix) {
personalities := strings.Fields(line[len(personalitiesPrefix):])
for _, word := range personalities {
word := word[1 : len(word)-1]
knownRaidTypes[word] = true
}
continue
}

deviceFields := strings.Fields(line)
if len(deviceFields) < 3 {
return nil, fmt.Errorf("%w: Expected 3+ lines, got %q", ErrFileParse, line)
}
mdName := deviceFields[0] // mdx
state := deviceFields[2] // active or inactive
state := deviceFields[2] // active, inactive, broken

mdType := "unknown" // raid1, raid5, etc.
mdType := "unknown" // raid1, raid5, etc.
var deviceStartIndex int
if len(deviceFields) > 3 { // mdType may be in the 3rd or 4th field
if isRaidType(deviceFields[3]) {
if isRaidType(deviceFields[3], knownRaidTypes) {
mdType = deviceFields[3]
} else if len(deviceFields) > 4 && isRaidType(deviceFields[4]) {
deviceStartIndex = 4
} else if len(deviceFields) > 4 && isRaidType(deviceFields[4], knownRaidTypes) {
// if the 3rd field is (...), the 4th field is the mdType
mdType = deviceFields[4]
deviceStartIndex = 5
}
}

if len(lines) <= i+3 {
return nil, fmt.Errorf("%w: Too few lines for md device: %q", ErrFileParse, mdName)
}

// Failed disks have the suffix (F) & Spare disks have the suffix (S).
// Failed (Faulty) disks have the suffix (F) & Spare disks have the suffix (S).
fail := int64(strings.Count(line, "(F)"))
spare := int64(strings.Count(line, "(S)"))
active, total, down, size, err := evalStatusLine(lines[i], lines[i+1])
Expand Down Expand Up @@ -160,6 +193,11 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
}
}

devices, err := evalComponentDevices(deviceFields[deviceStartIndex:])
if err != nil {
return nil, fmt.Errorf("error parsing components in md device %q: %w", mdName, err)
}

mdStats = append(mdStats, MDStat{
Name: mdName,
Type: mdType,
Expand All @@ -175,7 +213,7 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
BlocksSyncedPct: pct,
BlocksSyncedFinishTime: finish,
BlocksSyncedSpeed: speed,
Devices: evalComponentDevices(deviceFields),
Devices: devices,
})
}

Expand All @@ -185,11 +223,13 @@ func parseMDStat(mdStatData []byte) ([]MDStat, error) {
// isRaidType reports whether mdType looks like an md personality name.
// Rule 1: mdType should not be like (...)
// Rule 2: mdType should not be like sda[0]
// Rule 3: mdType must be present in knownRaidTypes
func isRaidType(mdType string) bool {
return !strings.ContainsAny(mdType, "([")
func isRaidType(mdType string, knownRaidTypes map[string]bool) bool {
	// Anything shaped like "(...)" or "sda[0]" is never a personality name,
	// regardless of what the lookup table contains.
	if strings.ContainsAny(mdType, "([") {
		return false
	}
	// Only key presence matters; the stored boolean value is ignored.
	_, known := knownRaidTypes[mdType]
	return known
}

func evalStatusLine(deviceLine, statusLine string) (active, total, down, size int64, err error) {
// e.g. 523968 blocks super 1.2 [4/4] [UUUU]
statusFields := strings.Fields(statusLine)
if len(statusFields) < 1 {
return 0, 0, 0, 0, fmt.Errorf("%w: Unexpected statusline %q: %w", ErrFileParse, statusLine, err)
Expand Down Expand Up @@ -280,17 +320,29 @@ func evalRecoveryLine(recoveryLine string) (blocksSynced int64, blocksToBeSynced
return blocksSynced, blocksToBeSynced, pct, finish, speed, nil
}

func evalComponentDevices(deviceFields []string) []string {
mdComponentDevices := make([]string, 0)
if len(deviceFields) > 3 {
for _, field := range deviceFields[4:] {
match := componentDeviceRE.FindStringSubmatch(field)
if match == nil {
continue
}
mdComponentDevices = append(mdComponentDevices, match[1])
// evalComponentDevices extracts the component devices of an MD array from the
// whitespace-separated fields of its /proc/mdstat device line.
//
// Fields that do not match componentDeviceRE (i.e. that are not shaped like
// "name[N]") are skipped. The returned slice is non-nil even when no component
// is found. If a descriptor index cannot be parsed as a 32-bit integer, a nil
// slice and an error are returned.
func evalComponentDevices(deviceFields []string) ([]MDStatComponent, error) {
	mdComponentDevices := make([]MDStatComponent, 0, len(deviceFields))
	for _, field := range deviceFields {
		// Submatch offsets: loc[2:4] is the device name, loc[4:6] the
		// descriptor index between the square brackets.
		loc := componentDeviceRE.FindStringSubmatchIndex(field)
		if loc == nil {
			continue
		}
		name := field[loc[2]:loc[3]]
		index := field[loc[4]:loc[5]]

		descriptorIndex, err := strconv.ParseInt(index, 10, 32)
		if err != nil {
			return nil, fmt.Errorf("error parsing int from device %q: %w", index, err)
		}

		// Everything after the closing ']' is the flag list. The kernel prints
		// each flag as its own parenthesised marker, e.g. "(W)(F)", so the
		// flags are read from the raw suffix rather than from a single regex
		// group, which would miss every marker after (or other than) the
		// first one it recognises.
		// https://github.com/torvalds/linux/blob/7ec462100ef9142344ddbf86f2c3008b97acddbe/drivers/md/md.c#L8376-L8392
		flags := field[loc[5]+1:]

		mdComponentDevices = append(mdComponentDevices, MDStatComponent{
			Name:            name,
			DescriptorIndex: int32(descriptorIndex),
			Faulty:          strings.Contains(flags, "(F)"),
			Spare:           strings.Contains(flags, "(S)"),
			Journal:         strings.Contains(flags, "(J)"),
			Replacement:     strings.Contains(flags, "(R)"),
			WriteMostly:     strings.Contains(flags, "(W)"),
		})
	}

	return mdComponentDevices, nil
}
57 changes: 38 additions & 19 deletions mdstat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ func TestFS_MDStat(t *testing.T) {
if err != nil {
t.Fatalf("parsing of reference-file failed entirely: %s", err)
}
// TODO: Test cases to capture in future:
// WriteMostly devices
// Journal devices
// Replacement devices
// Global hotspares

refs := map[string]MDStat{
"md127": {
Expand All @@ -43,7 +48,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdi2", "sdj2"}},
Devices: []MDStatComponent{{Name: "sdi2", DescriptorIndex: 0}, {Name: "sdj2", DescriptorIndex: 1}}},
"md0": {
Name: "md0",
Type: "raid1",
Expand All @@ -59,7 +64,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdi1", "sdj1"}},
Devices: []MDStatComponent{{Name: "sdi1", DescriptorIndex: 0}, {Name: "sdj1", DescriptorIndex: 1}}},
"md4": {
Name: "md4",
Type: "raid1",
Expand All @@ -75,7 +80,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sda3", "sdb3"}},
Devices: []MDStatComponent{{Name: "sda3", Faulty: true, DescriptorIndex: 0}, {Name: "sdb3", Spare: true, DescriptorIndex: 1}}},
"md6": {
Name: "md6",
Type: "raid1",
Expand All @@ -91,7 +96,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 8.5,
BlocksSyncedFinishTime: 17,
BlocksSyncedSpeed: 259783,
Devices: []string{"sdb2", "sdc", "sda2"}},
Devices: []MDStatComponent{{Name: "sdb2", DescriptorIndex: 2, Faulty: true}, {Name: "sdc", DescriptorIndex: 1, Spare: true}, {Name: "sda2", DescriptorIndex: 0}}},
"md3": {
Name: "md3",
Type: "raid6",
Expand All @@ -107,7 +112,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sda1", "sdh1", "sdg1", "sdf1", "sde1", "sdd1", "sdc1", "sdb1", "sdd1", "sdd2"}},
Devices: []MDStatComponent{{Name: "sda1", DescriptorIndex: 8}, {Name: "sdh1", DescriptorIndex: 7}, {Name: "sdg1", DescriptorIndex: 6}, {Name: "sdf1", DescriptorIndex: 5}, {Name: "sde1", DescriptorIndex: 11}, {Name: "sdd1", DescriptorIndex: 3}, {Name: "sdc1", DescriptorIndex: 10}, {Name: "sdb1", DescriptorIndex: 9}, {Name: "sdd1", DescriptorIndex: 10, Spare: true}, {Name: "sdd2", DescriptorIndex: 11, Spare: true}}},
"md8": {
Name: "md8",
Type: "raid1",
Expand All @@ -123,7 +128,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 8.5,
BlocksSyncedFinishTime: 17,
BlocksSyncedSpeed: 259783,
Devices: []string{"sdb1", "sda1", "sdc", "sde"}},
Devices: []MDStatComponent{{Name: "sdb1", DescriptorIndex: 1}, {Name: "sda1", DescriptorIndex: 0}, {Name: "sdc", DescriptorIndex: 2, Spare: true}, {Name: "sde", DescriptorIndex: 3, Spare: true}}},
"md7": {
Name: "md7",
Type: "raid6",
Expand All @@ -139,7 +144,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdb1", "sde1", "sdd1", "sdc1"}},
Devices: []MDStatComponent{{Name: "sdb1", DescriptorIndex: 0}, {Name: "sde1", DescriptorIndex: 3}, {Name: "sdd1", DescriptorIndex: 2}, {Name: "sdc1", DescriptorIndex: 1, Faulty: true}}},
"md9": {
Name: "md9",
Type: "raid1",
Expand All @@ -155,7 +160,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdc2", "sdd2", "sdb2", "sda2", "sde", "sdf", "sdg"}},
Devices: []MDStatComponent{{Name: "sdc2", DescriptorIndex: 2}, {Name: "sdd2", DescriptorIndex: 3}, {Name: "sdb2", DescriptorIndex: 1}, {Name: "sda2", DescriptorIndex: 0}, {Name: "sde", DescriptorIndex: 4, Faulty: true}, {Name: "sdf", DescriptorIndex: 5, Faulty: true}, {Name: "sdg", DescriptorIndex: 6, Spare: true}}},
"md10": {
Name: "md10",
Type: "raid0",
Expand All @@ -171,7 +176,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sda1", "sdb1"}},
Devices: []MDStatComponent{{Name: "sda1", DescriptorIndex: 0}, {Name: "sdb1", DescriptorIndex: 1}}},
"md11": {
Name: "md11",
Type: "raid1",
Expand All @@ -187,7 +192,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdb2", "sdc2", "sdc3", "hda", "ssdc2"}},
Devices: []MDStatComponent{{Name: "sdb2", DescriptorIndex: 0}, {Name: "sdc2", DescriptorIndex: 1}, {Name: "sdc3", DescriptorIndex: 2, Faulty: true}, {Name: "hda", DescriptorIndex: 4, Spare: true}, {Name: "ssdc2", DescriptorIndex: 3, Spare: true}}},
"md12": {
Name: "md12",
Type: "raid0",
Expand All @@ -203,7 +208,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdc2", "sdd2"}},
Devices: []MDStatComponent{{Name: "sdc2", DescriptorIndex: 0}, {Name: "sdd2", DescriptorIndex: 1}}},
"md120": {
Name: "md120",
Type: "linear",
Expand All @@ -219,7 +224,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sda1", "sdb1"}},
Devices: []MDStatComponent{{Name: "sda1", DescriptorIndex: 1}, {Name: "sdb1", DescriptorIndex: 0}}},
"md126": {
Name: "md126",
Type: "raid0",
Expand All @@ -235,7 +240,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdb", "sdc"}},
Devices: []MDStatComponent{{Name: "sdb", DescriptorIndex: 1}, {Name: "sdc", DescriptorIndex: 0}}},
"md219": {
Name: "md219",
Type: "unknown",
Expand All @@ -251,7 +256,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdc", "sda"}},
Devices: []MDStatComponent{{Name: "sdb", DescriptorIndex: 2, Spare: true}, {Name: "sdc", DescriptorIndex: 1, Spare: true}, {Name: "sda", DescriptorIndex: 0, Spare: true}}},
"md00": {
Name: "md00",
Type: "raid0",
Expand All @@ -267,7 +272,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"xvdb"}},
Devices: []MDStatComponent{{Name: "xvdb", DescriptorIndex: 0}}},
"md101": {
Name: "md101",
Type: "raid0",
Expand All @@ -283,7 +288,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 0,
BlocksSyncedFinishTime: 0,
BlocksSyncedSpeed: 0,
Devices: []string{"sdb", "sdd", "sdc"}},
Devices: []MDStatComponent{{Name: "sdb", DescriptorIndex: 2}, {Name: "sdd", DescriptorIndex: 1}, {Name: "sdc", DescriptorIndex: 0}}},
"md201": {
Name: "md201",
Type: "raid1",
Expand All @@ -299,7 +304,7 @@ func TestFS_MDStat(t *testing.T) {
BlocksSyncedPct: 5.7,
BlocksSyncedFinishTime: 0.2,
BlocksSyncedSpeed: 114176,
Devices: []string{"sda3", "sdb3"}},
Devices: []MDStatComponent{{Name: "sda3", DescriptorIndex: 0}, {Name: "sdb3", DescriptorIndex: 1}}},
}

if want, have := len(refs), len(mdStats); want != have {
Expand All @@ -314,18 +319,32 @@ func TestFS_MDStat(t *testing.T) {
}

func TestInvalidMdstat(t *testing.T) {
invalidMount := [][]byte{[]byte(`
invalidMount := [][]byte{
// Test invalid Personality and format
[]byte(`
Personalities : [invalid]
md3 : invalid
314159265 blocks 64k chunks
unused devices: <none>
`),
// Test extra blank line
[]byte(`
md12 : active raid0 sdc2[0] sdd2[1]
3886394368 blocks super 1.2 512k chunks
`)}
`),
// test for impossible component state
[]byte(`
md127 : active raid1 sdi2[0] sdj2[1](Z)
312319552 blocks [2/2] [UU]
`),
// test for malformed component state
[]byte(`
md127 : active raid1 sdi2[0] sdj2[X]
312319552 blocks [2/2] [UU]
`),
}

for _, invalid := range invalidMount {
_, err := parseMDStat(invalid)
Expand Down

0 comments on commit dca5b72

Please sign in to comment.