From 9609edb207dadcd054caf982a54bbef4483f3a38 Mon Sep 17 00:00:00 2001 From: Bruce Johnston Date: Wed, 15 May 2024 17:03:02 -0400 Subject: [PATCH] Our test code now assumes that we are using the lvm devices file, as this is now the default on most distros. As such we've added cleanup code to handle such situations. - Added an additional check function called checkLVMDevicesFile() that will examine the lvm devices file and remove any orphaned entries. Orphaned entries exist in the file but have no associated device. - Renamed checkLVM to checkDeviceStack and reworked the code to so that it handles all our devices, including the removal of devices from the lvm devices file. - Make sure we don't try to use the devices file if the distro does not support that. Signed-off-by: Bruce Johnston --- perl/bin/checkServer.pl | 243 +++++++++++++++++++++++++++------------- 1 file changed, 167 insertions(+), 76 deletions(-) diff --git a/perl/bin/checkServer.pl b/perl/bin/checkServer.pl index 9283318..24296c7 100755 --- a/perl/bin/checkServer.pl +++ b/perl/bin/checkServer.pl @@ -384,7 +384,8 @@ sub main { ($machine eq 's390x') ? () : \&checkKDumpConfig, ($machine eq 's390x') ? () : \&checkKExecLoaded, \&checkClocksource, - \&checkLVM, + \&checkDeviceStack, + \&checkLVMDevicesFile, \&checkLVMConf, \&checkModules, \&checkPythonHacks, @@ -2042,9 +2043,9 @@ sub checkKernLog { } ###################################################################### -# Check that the machine has no logical volumes set up. +# Check that the machine has no block devices we don't expect to see. ## -sub checkLVM { +sub checkDeviceStack { # Start by checking to make sure we can run vgscan and that it isn't in a # state that would cause things to hang. my $timeout = 30; @@ -2066,17 +2067,48 @@ sub checkLVM { # configuration. my $vgPattern = _generateScratchVGPattern(); + # List of known base storage device names on PMI Farms + my @deviceNames = getTestBlockDeviceNames(); + + # To start, check for any pv/vg pairs that have no lvs in them. + # This should only ever happen on the very top device in the stack + # that failed between creating a volume group and creating its lv. + open(my $fh, "sudo pvs --noheadings -o pv_name,vg_name 2>/dev/null |") + || _die("couldn't run pvs; OS_ERROR = \"$OS_ERROR\""); + @outputLines = <$fh>; + close($fh) || _die("couldn't run pvs; $CHILD_ERROR"); + my %physicalVolumes = map { + my @words = split(" ", $_); + $words[0] => ($words[1] // "") + } @outputLines; + my @pvs = keys(%physicalVolumes); + + @pvs = grep { $physicalVolumes{$_} !~ /^($vgPattern)$/ } @pvs; + + foreach my $pv (@pvs) { + my $vg = $physicalVolumes{$pv}; + my $lv_count = `sudo vgs -o lv_count --noheadings $vg`; + if ($lv_count == 0) { + my $removeCmd = "sudo vgs $vg && sudo vgremove --force $vg;\n" + . "sudo pvs $pv && sudo pvremove --force $pv && vgscan"; + error("Found empty vg $vg in pv $pv", $removeCmd); + } + } + # Get the tree of block devices with name and type, forced to ASCII - # representation. - open(my $fh, "env LANG=C lsblk -noheadings -o NAME,TYPE |") + # representation. Use paths so we can use them for lvmdevices stuff + # if its supported. Use ascii so its clearer what characters we should + # match on. + open($fh, "env LANG=C lsblk --paths --ascii --noheadings -o NAME,TYPE |") || _die("couldn't run lsblk; OS_ERROR = \"$OS_ERROR\""); my @volumes = (); foreach my $line (<$fh>) { chomp $line; - if ($line =~ /^\s*([`|-]-)*(?([\w.-])+)\s+(?\w+)$/) { - my $device = {name => $+{name}, type => $+{type}}; + if ($line =~ /^\s*([`|-]-)*(?([^\/]*\/)+(?([\w.-])+))\s+(?\w+)$/) { + my $device = {path => $+{path}, name => $+{name}, type => $+{type}}; if ($device->{name} !~ /$vgPattern/) { + # Important for device ordering unshift(@volumes, $device); } else { $log->debug("Scratch device '$device->{name}' ignored"); @@ -2087,52 +2119,37 @@ sub checkLVM { close($fh) || _die("couldn't run lsblk; $CHILD_ERROR"); foreach my $volume (@volumes) { - if ($volume->{type} eq "lvm") { - my $split = `dmsetup splitname --noheadings $volume->{name}`; + my $name = $volume->{name}; + my $type = $volume->{type}; + my $path = $volume->{path}; + if ($type eq "lvm") { + my $split = `dmsetup splitname --noheadings $name`; my ($vg,$lv) = split(/:/, $split); # Remove the lv, and if it's the last lv, remove the vg too my $lvremove = "sudo lvremove --force $vg/$lv;\n" . "lv_count=`sudo vgs -o lv_count --noheadings $vg`;\n" - . "[ \"\$lv_count\" != 0 ] || sudo vgremove --force $vg"; + . "[ \$lv_count -gt 0 ] || sudo vgremove --force $vg"; error("Found LVM logical volume $lv", $lvremove); - } elsif ($volume->{type} eq "dm" or $volume->{type} eq "vdo") { - my $name = $volume->{name}; - error("Found device mapper target $name", - "sudo dmsetup remove $name || sudo dmsetup remove --force $name"); + removeFromLVMDevicesFile($path); + } elsif ($type eq "dm" or $type eq "vdo") { + my $dmremove = "sudo dmsetup remove $name || sudo dmsetup remove --force $name"; + error("Found device mapper target $name", $dmremove); + removeFromLVMDevicesFile($path); + } elsif ($type =~ /raid/ && _isPMIFarm()) { + my $raidremove = "sudo mdadm --stop $path;\n" + . "sudo mdadm --zero-superblock " . join(" ", @deviceNames); + error("Found raid device $name", $raidremove); + removeFromLVMDevicesFile($path); } } - # Get a list of volume groups for removal in case any are left after - # the initial pass of lvm fixes. - open($fh, "sudo vgs --noheadings -o vg_name 2>/dev/null |") - || _die("couldn't run vgs; OS_ERROR = \"$OS_ERROR\""); - @outputLines = <$fh>; - close($fh) || _die("couldn't run vgs; $CHILD_ERROR"); - map { chomp; s/^\s*|\s*$//g; } @outputLines; - @outputLines = grep(!/^($vgPattern)/, @outputLines); - foreach my $vg (@outputLines) { - error("found LVM volume group $vg", - "sudo vgs $vg && sudo vgremove --force $vg"); - } - - # Get a list of physical volumes for removal in case any are left - # after the initial pass of lvm fixes. - open($fh, "sudo pvs --noheadings -o pv_name,vg_name 2>/dev/null |") - || _die("couldn't run pvs; OS_ERROR = \"$OS_ERROR\""); - @outputLines = <$fh>; - close($fh) || _die("couldn't run pvs; $CHILD_ERROR"); - my %physicalVolumes = map { - my @words = split(" ", $_); - $words[0] => ($words[1] // "") - } @outputLines; - @volumes = keys(%physicalVolumes); - - # Remove any devices associated with standard volume groups. - @volumes = grep { $physicalVolumes{$_} !~ /^($vgPattern)$/ } @volumes; - - if (@volumes) { - error("found LVM physical volumes @volumes", - "sudo pvremove -f @volumes && vgscan"); + # In case any are left after the initial pass, try to remove all + # pvs/vgs again. + foreach my $pv (@pvs) { + my $vg = $physicalVolumes{$pv}; + error("found LVM physical volume $pv", + "sudo vgs $vg && sudo vgremove --force $vg;\n" + . "sudo pvs $pv && sudo pvremove --force $pv && sudo vgscan"); } # Check for warnings of orphaned physical volumes since their existence @@ -2147,6 +2164,59 @@ sub checkLVM { error("found orphaned physical volume", "sudo pvscan --cache"); } + + # Last resort. Hit it with the sledgehammer + if (_isPMIFarm()) { + foreach my $raidDisk (@deviceNames) { + my $hasFS = `wipefs -n $raidDisk`; + if ($hasFS ne "") { + my $wipeCmd = "wipefs --all --force $raidDisk;\n" + . "dd if=/dev/zero of=$raidDisk bs=1M count=2000"; + error("$raidDisk is not clean", $wipeCmd); + } + } + } +} + +###################################################################### +# Remove device from lvm devices file (if it exists). +## +sub removeFromLVMDevicesFile { + my ($path) = @_; + # check whether lvmdevices is set up or not + my $status = `sudo lvmdevices 2>&1`; + if ($status =~ m/Devices file not enabled/m) { + return; + } + push(@fixes, "sudo lvmdevices --deldev $path"); +} + +###################################################################### +# Clean up orphaned devices in lvm devices file (if it exists). +## +sub checkLVMDevicesFile { + # check whether lvmdevices is set up or not + my $status = `sudo lvmdevices 2>&1`; + if ($status =~ m/Devices file not enabled/m) { + return; + } + + my @devices; + open(my $fh, "sudo lvmdevices --check |") + || _die("couldn't run lvmdevices; OS_ERROR = \"$OS_ERROR\""); + foreach my $line (<$fh>) { + chomp $line; + if ($line =~ m/.*DEVNAME=([^\s]*).*/m) { + push(@devices, $1); + } + } + + close($fh) || _die("couldn't run lvmdevices; $CHILD_ERROR"); + + foreach my $device (@devices) { + error("found orphaned device $device in LVM devices file", + "sudo lvmdevices --yes --deldev $device"); + } } ###################################################################### @@ -2536,38 +2606,59 @@ sub checkISCSIInitiator { ######################################################################## # Make sure we can identify the test storage device on this machine. -# -# N.B.: Make sure this stays in sync with the code in -# Permabit::LabUtils::getTestBlockDeviceNames! ## sub checkTestStorageDevice { - my $gotTestDevice = 0; - my @testDevices = ( - "/dev/vdo_scratch", - "/dev/vdo_scratchdev_*", - "/dev/md0", - "/dev/xvda2", - "/dev/sda8" - ); + getTestBlockDeviceNames(); +} - if (_isPMIFarm()) { - my @pmiScratchDevices = glob("/dev/vdo_scratchdev_vd*"); - if (!@pmiScratchDevices) { - error("unable to locate test storage device"); - return; - } - push(@testDevices, @pmiScratchDevices); - } - my $megaraid = getScamVar("MEGARAID"); - if ($megaraid) { - chomp($megaraid); - unshift(@testDevices, "$megaraid-part1"); - } - foreach my $device (@testDevices) { - if (-b $device) { - return; - } - } - # no fix available - error("unable to locate test storage device"); +############################################################################# +# Get the name of the scratch block devices for a test to use. +# +# @return array ref of test device names (full path) +## +sub getTestBlockDeviceNames { + # Just ask the host which of a standard list of devices exists on the system. + # Use the first group of devices in the list that we actually find. Optimize + # this query by sending just a single command to the remote host. + # + # The list of devices is: + # 1 - partition #1 on a Megaraid controller (VDO-PMI parkst) + # 2 - /dev/vdo_scratch (afarms & lfarms) + # 3 - /dev/vdo_scratchdev_* (raid setup) + # 4 - /dev/md0 (ALBIREO-PMI parkst) + # 5 - /dev/xvda2 (afarms & lfarms) + # 6 - /dev/sda8 (vfarms) + # + # N.B.: Make sure this stays in sync with the code in + # Permabit::LabUtils::getTestBlockDeviceNames! + # + my $code = <<'EOF'; +for D in "`/sbin/scam MEGARAID`-part1" "/dev/vdo_scratch" "/dev/vdo_scratchdev_*" "/dev/md0" "/dev/xvda2" "/dev/sda8"; +do + F=0; + for E in $D; + do + if test -b $E; + then + echo $E; + F=1; + fi; + done; + if test $F -eq 1; + then + break; + fi; +done; +EOF + + open(my $fh, "$code 2>/dev/null|") + || _die("unable to locate test storage device: $ERRNO"); + my @devices = <$fh>; + close($fh) || _die("unable to locate test storage device: $CHILD_ERROR"); + chomp(@devices); + if (scalar(@devices) == 0) { + error("Unable to locate test storage device"); + } + return @devices; } +