Skip to content

Commit

Permalink
Merge pull request open-power#791 from FarooqAbdulla02/watchdog_tests
Browse files Browse the repository at this point in the history
"watchdog Kernel module" functionality with different supported parameters.
  • Loading branch information
abdhaleegit authored Mar 29, 2024
2 parents b0ffcc2 + 07b51fb commit 1fac29c
Show file tree
Hide file tree
Showing 2 changed files with 318 additions and 8 deletions.
15 changes: 15 additions & 0 deletions common/OpTestHost.py
Original file line number Diff line number Diff line change
Expand Up @@ -976,6 +976,21 @@ def host_check_binary(self, i_dir, i_file, console=0):
l_msg = "Executable file %s/%s is not present" % (i_dir, i_file)
log.debug(l_msg)
return False
def host_check_module_loaded(self, i_module):
    '''
    Function to check kernel module is loaded on the host.

    The module list is read from /proc/modules through host_run_command,
    so the check is done on the host under test and not on the machine
    that runs the test framework.

    :params i_module: Name of kernel module
    :return: True when loaded, else False.
    :rtype: boolean.
    '''
    # NOTE(review): assumes host_run_command returns the command output as
    # a list of lines, which is how callers in the test cases index it.
    for line in self.host_run_command("cat /proc/modules"):
        module_info = line.split()
        # The first column of /proc/modules is the module name; blank
        # lines (e.g. console residue) are ignored instead of crashing.
        if module_info and module_info[0] == i_module:
            return True
    return False



class OpTestLPAR(OpTestHost):
Expand Down
311 changes: 303 additions & 8 deletions testcases/PowerNVDump.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,9 @@ def verify_dump_file(self, boot_type=BootType.NORMAL, dump_place="local"):
Verify if dump file present
'''
if self.distro == "rhel":
self.c.run_command("cp /etc/kdump.conf_bck /etc/kdump.conf")
self.cv_HOST.host_run_command("cp /etc/kdump.conf_bck /etc/kdump.conf", timeout=60)
if self.distro == "sles":
self.c.run_command("cp /etc/sysconfig/kdump_bck /etc/sysconfig/kdump")
self.cv_HOST.host_run_command("cp /etc/sysconfig/kdump_bck /etc/sysconfig/kdump")
if dump_place == "local":
crash_content_after = self.c.run_command(
"ls -l /var/crash | grep '^d'| awk '{print $9}'")
Expand Down Expand Up @@ -359,6 +359,7 @@ def kernel_crash(self, crash_type="echo_c"):
reboot. It has below steps
1. Enable reboot on kernel panic: echo 10 > /proc/sys/kernel/panic
2. Trigger kernel crash: echo c > /proc/sysrq-trigger
3. If trigger requeted by watchdog then call watchdog trigger event.
return BMC_CONST.FW_SUCCESS or raise OpTestError
'''
self.c.run_command("uname -a")
Expand All @@ -369,22 +370,27 @@ def kernel_crash(self, crash_type="echo_c"):
self.c.run_command("echo 10 > /proc/sys/kernel/panic")
# Enable sysrq before triggering the kernel crash
self.c.pty.sendline("echo 1 > /proc/sys/kernel/sysrq")
if crash_type == "echo_c":
# Check if crash requested by watchdog event, if yes then call the
# watchdog_run_command function to execute the event.
if crash_type == "watchdog":
self.c.pty.sendline("./watchdog-countdown")
elif crash_type == "echo_c":
self.c.pty.sendline("echo c > /proc/sysrq-trigger")
elif crash_type == "hmc":
self.cv_HMC.run_command("chsysstate -r lpar -m %s -n %s -o dumprestart" %
(self.system_name, self.lpar_name), timeout=300)

done = False
boot_type = BootType.NORMAL
rc = -1
while not done:
try:
# MPIPL completion + system reboot would take time, keeping it
# 600 seconds. Post MPIPL, kernel will offload vmcore and reboot
# 1800 seconds. Post MPIPL, kernel will offload vmcore and reboot
# system. Hostboot will run istep 10.1 in normal boot only. So
# check for istep 10.1 to detect normal boot.
rc = self.c.pty.expect(
["ISTEP 10. 1", "saving vmcore complete", "saved vmcore", "Rebooting."], timeout=600)
["ISTEP 10. 1", "saving vmcore complete", "saved vmcore", "Rebooting."], timeout=1800)
except KernelFADUMP:
log.debug("====================MPIPL boot started==================")
# if fadump is enabled system should start MPIPL after kernel crash
Expand Down Expand Up @@ -424,6 +430,10 @@ def kernel_crash(self, crash_type="echo_c"):
self.cv_SYSTEM.set_state(OpSystemState.UNKNOWN)
done = True

# This will ensure the system state as "Not Activated" and captures the state of
# LPAR from HMC as "Not Activated" before rebooting the LPAR.
if self.cv_HMC.get_lpar_state() == "Not Activated":
return
self.cv_SYSTEM.goto_state(OpSystemState.OS)
log.debug("System booted fine to host OS...")
return boot_type
Expand Down Expand Up @@ -928,8 +938,8 @@ def setUp(self):
def setup_san(self):
self.cv_SYSTEM.goto_state(OpSystemState.OS)
if self.distro == "rhel":
self.c.run_command("sfdisk --delete %s" % self.dev_path)
self.c.run_command("echo , | sfdisk --force %s" % self.dev_path, timeout=120)
self.cv_HOST.host_run_command("sfdisk --delete %s" % self.dev_path)
self.cv_HOST.host_run_command("echo , | sfdisk --force %s" % self.dev_path, timeout=120)
try: self.c.run_command("umount %s1" % self.dev_path)
except: pass
self.c.run_command("dd if=/dev/zero bs=512 count=512 of=%s1" % self.dev_path)
Expand All @@ -941,7 +951,8 @@ def setup_san(self):
self.c.run_command("sed -i '/\/var\/crash %s/d' /etc/fstab;"
"echo '%s1 /var/crash %s defaults 0 0' >> /etc/fstab; sync" % (
self.filesystem, self.dev_path, self.filesystem))
self.c.run_command("mount -t %s %s1 /var/crash" % (self.filesystem, self.dev_path))
self.c.run_command("systemctl daemon-reload")
self.cv_HOST.host_run_command("mount -t %s %s1 /var/crash" % (self.filesystem, self.dev_path), timeout=60)
else:
self.c.run_command("sed -i 's/-l --message-level/-l -F --message-level/' /etc/kdump.conf; sync")
self.c.run_command("sed -i '/^raw/ s/^#*/#/' /etc/kdump.conf;"
Expand Down Expand Up @@ -1352,8 +1363,292 @@ def runTest(self):
self.cv_SYSTEM.goto_state(OpSystemState.OFF)
self.cv_SYSTEM.goto_state(OpSystemState.OS)

class OpTestWatchdog(PowerNVDump):
    '''
    This test verifies "watchdog module" with different scenarios like
    1. Watchdog module load and unload.
    2. watchdog with action 1/ Reboot LPAR.
    3. watchdog with action 0/ Halt LPAR.
    4. watchdog with action 2 /dump collect
    a. dump collect to a local disk
    b. dump collect to a NFS disk
    c. dump collect to a SAN FC disk
    '''

    # Name of the watchdog kernel module exercised by every scenario.
    WATCHDOG_MODULE = "pseries_wdt"
    # crashkernel reservation put on the kernel command line before the
    # dump collection scenarios.
    CRASHKERNEL_ARGS = "crashkernel=2G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G"

    def setUp(self):
        '''
        Run the parent setUp and make sure the LPAR belongs to the
        managed system reported by the HMC.

        :raises OpTestError: when the LPAR is not found in the managed system.
        '''
        super(OpTestWatchdog, self).setUp()
        self.mg_system = self.cv_HMC.mg_system

        if not self.cv_HMC.is_lpar_in_managed_system(self.mg_system, self.cv_HMC.lpar_name):
            raise OpTestError("Lpar %s not found in managed system %s" % (
                self.cv_HMC.lpar_name, self.mg_system))

    def get_watchdog_tool(self):
        '''
        This function copies the compiled watchdog-countdown tool to the
        /root directory of the host; using this tool we can trigger the
        watchdog events.
        '''
        filename = "watchdog-countdown"
        self.cv_HOST.copy_test_file_to_host(filename, dstdir="/root")

    def module_load_with_parameters(self, i_module, timeout, action):
        '''
        This function will load the module using modprobe with timeout
        and action parameters, and remembers the requested values in
        self.script_timeout / self.script_action for later validation.

        :params
        i_module: watchdog Module name "pseries_wdt".
        timeout: timeout value while loading module
        action: type of reset.
        Action_0 : halt the LPAR
        Action_1 : reboot the LPAR
        Action_2 : collect dump and reboot LPAR
        :return: (timeout, action) as requested
        :rtype: tuple
        :raises OpTestError: when modprobe fails.
        '''
        cmd = f"modprobe {i_module} timeout={timeout} action={action}"
        try:
            self.cv_HOST.host_run_command(cmd)
        except CommandFailed as c:
            l_msg = "Error in loading the module %s, modprobe failed: %s" % (
                i_module, str(c))
            raise OpTestError(l_msg) from c
        self.script_timeout = timeout
        self.script_action = action
        return self.script_timeout, self.script_action

    def validate_timeout_and_action(self):
        '''
        Function to validate timeout and action of watchdog module: the
        values read back from /sys must match the ones passed to modprobe
        by module_load_with_parameters.

        :return: True when both values match; otherwise the test is
                 failed through self.fail.
        '''
        system_timeout = self.get_watchdog_timeout_value(self.WATCHDOG_MODULE)
        system_action = self.get_watchdog_action_value(self.WATCHDOG_MODULE)

        if system_timeout == self.script_timeout and system_action == self.script_action:
            return True
        self.fail("Timeout and action values of watchdog module "
                  "are incorrect,Please check logs")

    def _read_module_parameter(self, i_module, parameter):
        '''
        Read /sys/module/<i_module>/parameters/<parameter> on the host and
        return the first output line converted to int.
        '''
        cmd = f"cat /sys/module/{i_module}/parameters/{parameter}"
        output = self.cv_HOST.host_run_command(cmd)
        return int(output[0])

    def get_watchdog_timeout_value(self, i_module):
        '''
        Function to get the timeout value of a watchdog module as
        currently reported by the system. The value is also stored in
        self.system_timeout_value.

        :params i_module: watchdog module i.e "pseries_wdt"
        :rtype int
        '''
        self.system_timeout_value = self._read_module_parameter(i_module, "timeout")
        return self.system_timeout_value

    def get_watchdog_action_value(self, i_module):
        '''
        Function to get the action value of a watchdog module as
        currently reported by the system. The value is also stored in
        self.watchdog_action_mode.

        :params i_module: watchdog module i.e "pseries_wdt"
        :rtype int
        '''
        self.watchdog_action_mode = self._read_module_parameter(i_module, "action")
        return self.watchdog_action_mode

    def module_load(self, i_module="pseries_wdt"):
        '''
        Function to load watchdog module with its default parameters.

        :params i_module: module to load, "pseries_wdt" by default.
        '''
        self.cv_SYSTEM.cv_HOST.host_load_module(i_module)

    def module_unload(self, i_module):
        '''
        This function will unload the module using modprobe
        and validates module unload.

        :params i_module: module to unload.
        :raises OpTestError: when modprobe -r fails or the module is
                             still reported as loaded afterwards.
        '''
        try:
            self.cv_HOST.host_run_command("modprobe -r %s" % i_module)
        except CommandFailed as c:
            l_msg = "Error in unloading the module %s, modprobe -r failed: %s" % (
                i_module, str(c))
            raise OpTestError(l_msg) from c
        if self.cv_HOST.host_check_module_loaded(i_module):
            raise OpTestError(f"{i_module} module still present even after unload,Please check logs")

    def check_module_support(self):
        '''
        Function to check the watchdog module is supported with the
        current kernel, by looking for a pseries-wdt module file under
        /lib/modules/$(uname -r).

        :return : True if supported
        :rtype : boolean
        :raises OpTestError: when no module file is found, so none of the
                             test cases are executed.
        '''
        cmd = "find /lib/modules/$(uname -r) -type f -name '*.ko*' | grep pseries-wdt"
        try:
            self.cv_HOST.host_run_command(cmd)
        except CommandFailed as c:
            msg = " Watchdog module is not supported in this kernel, Please check."
            raise OpTestError(msg) from c
        return True

    def check_module_load_unload(self, i_module="pseries_wdt"):
        '''
        This function unloads and loads the watchdog module repeatedly;
        the iteration count comes from the "count" configuration argument
        and defaults to 10.

        :params i_module: module to cycle, "pseries_wdt" by default.
        :raises OpTestError: when the module is not supported or a
                             load/unload cycle fails.
        '''
        conf = OpTestConfiguration.conf
        self.count = conf.args.count or "10"
        # The support check must be *called*; testing the bound method
        # itself is always true and silently skips the check.
        if self.check_module_support():
            for _ in range(int(self.count)):
                try:
                    self.module_unload(i_module)
                    time.sleep(2)
                    log.info("Module unloaded ")
                    self.module_load(i_module)
                    log.info("module got loaded ")
                except CommandFailed as c:
                    msg = "watchdog module load and unload has issues,Please check logs."
                    raise OpTestError(msg) from c

    def _run_watchdog_action(self, timeout, action, expected_state):
        '''
        Reload the watchdog module with the given timeout/action, trigger
        the watchdog event and fail the test unless the HMC reports the
        LPAR in expected_state afterwards.
        '''
        if self.check_module_support():
            self.module_unload(self.WATCHDOG_MODULE)
            self.module_load_with_parameters(self.WATCHDOG_MODULE, timeout, action)
            if self.validate_timeout_and_action():
                log.info("=============== Testing watchdog with Action %s ===============" % action)
                self.get_watchdog_tool()
                self.kernel_crash(crash_type="watchdog")
                if self.cv_HMC.get_lpar_state() != expected_state:
                    self.fail("System state is mismatching after the watchdog event,Please check logs")

    def check_wd_action_one(self):
        '''
        Function to trigger watchdog event with action set to "1"
        which reboots the LPAR.
        '''
        self._run_watchdog_action(60, 1, "Running")

    def check_wd_action_zero(self):
        '''
        Function to trigger watchdog event with action set to "0"
        which Shutdown the LPAR.
        '''
        self._run_watchdog_action(120, 0, "Not Activated")

    def _prepare_dump_collection(self):
        '''
        Common preparation for the dump collection scenarios: update the
        crashkernel kernel argument, power cycle to the OS, enable the
        kdump service and reload the watchdog module with action 2.
        '''
        # NOTE(review): the indentation of the original code was lost in
        # the view this was written from; only the kdumpctl check is
        # assumed to be rhel specific - confirm against the repository.
        if self.distro == "rhel":
            self.cv_HOST.host_check_command("kdumpctl")
        obj = OpTestInstallUtil.InstallUtil()
        if not obj.update_kernel_cmdline(self.distro, args=self.CRASHKERNEL_ARGS,
                                         reboot=True, reboot_cmd=True):
            self.fail("KernelArgTest failed to update kernel args")
        time.sleep(5)
        self.cv_SYSTEM.goto_state(OpSystemState.OFF)
        self.cv_SYSTEM.goto_state(OpSystemState.OS)
        os_level = self.cv_HOST.host_get_OS_Level()
        self.cv_HOST.host_run_command("stty cols 300;stty rows 30")
        self.cv_HOST.host_enable_kdump_service(os_level)
        self.module_unload(self.WATCHDOG_MODULE)
        self.module_load_with_parameters(self.WATCHDOG_MODULE, 60, 2)

    def check_wd_overNFS(self):
        '''
        Function to execute watchdog test case and collect
        crash dump over remote based NFS directory.

        The test is skipped unless both --dump-server-ip and
        --dump-server-pw are provided.
        '''
        conf = OpTestConfiguration.conf
        self.dump_server_ip = conf.args.dump_server_ip
        # Both values are needed for a network dump; check up front so the
        # LPAR is not rebooted only to skip afterwards. skipTest raises by
        # itself, no explicit raise is needed.
        if not (self.dump_server_ip and self.dump_server_pw):
            self.skipTest("Provide --dump-server-ip and --dump-server-pw "
                          "for network dumps")
        self.kdumpNFS = KernelCrash_KdumpNFS()
        self.kdumpNFS.setUp()
        if self.check_module_support():
            self._prepare_dump_collection()

            if self.validate_timeout_and_action():
                self.setup_pwdless_auth()
                self.setup_test("net")
                self.kdumpNFS.setup_nfs()
                log.info("=============== Testing kdump over nfs ===============")
                self.get_watchdog_tool()
                boot_type = self.kernel_crash(crash_type="watchdog")
                self.verify_dump_file(boot_type, dump_place="net")
                self.setup_test("net")

    def check_wd_overSAN(self):
        '''
        Function to execute watchdog test case and collect
        crash dump on a SAN disk prepared by KernelCrash_KdumpSAN.
        '''
        self.kdumpSAN = KernelCrash_KdumpSAN()
        self.kdumpSAN.setUp()
        if self.check_module_support():
            self._prepare_dump_collection()
            if self.validate_timeout_and_action():
                self.setup_test()
                self.kdumpSAN.setup_san()
                log.info("=============== Testing kdump over SAN ===============")
                self.get_watchdog_tool()
                boot_type = self.kernel_crash(crash_type="watchdog")
                self.verify_dump_file(boot_type)
                self.setup_test()

    def check_wd_localdisk(self):
        '''
        Function to execute watchdog test case and collect
        crash dump on local "/var/crash" directory.
        '''
        if self.check_module_support():
            self._prepare_dump_collection()
            if self.validate_timeout_and_action():
                self.setup_test()
                self.get_watchdog_tool()
                boot_type = self.kernel_crash(crash_type="watchdog")
                self.verify_dump_file(boot_type)

    def runTest(self):
        '''
        Run every watchdog scenario; the halt scenario (action 0) is last
        because it leaves the LPAR in "Not Activated" state.
        '''
        self.check_module_load_unload()
        self.check_wd_action_one()
        self.check_wd_localdisk()
        self.check_wd_overSAN()
        self.check_wd_overNFS()
        self.check_wd_action_zero()


def crash_suite():
s = unittest.TestSuite()
s.addTest(OpTestWatchdog())
s.addTest(KernelCrash_OnlyKdumpEnable())
s.addTest(KernelCrash_KdumpSMT())
s.addTest(KernelCrash_KdumpSSH())
Expand Down

0 comments on commit 1fac29c

Please sign in to comment.