diff --git a/controller.py b/controller.py index 73480b2..1116f42 100755 --- a/controller.py +++ b/controller.py @@ -21,6 +21,7 @@ import logging from multiprocessing import Manager, Process, Value from pathlib import Path +import psutil import signal from statistics import mean import subprocess @@ -791,22 +792,40 @@ def controller( if len(times) > 0: time_max = max(times) - for i in range(len(p_list)): - p = p_list[i] + for i, p in enumerate(p_list): # Find finished processes p["process"].join(timeout=0) - # Kill process if timeout exceeded and gdb is not used + # Halt experiment if timeout duration exceeded + # If gdb is used the timeout is not applicable if ( p["process"].is_alive() and (time.time() - p["start_time"]) > config_qemu["timeout"] and not config_qemu.get("gdb", False) ): - clogger.error( - f"Process {p['process'].name} ran into timeout and was killed!" - ) + clogger.warning(f"Experiment {p['experiment_index']} ran into timeout") + # Search for qemu process and kill if found + qemu_process_name = f"qemu{p['experiment_index']}" + for process in psutil.process_iter(): + if process.name() != qemu_process_name: + continue + clogger.debug(f"{process.name()} killed") + process.kill() + break + else: + clogger.debug(f"{qemu_process_name} not found to kill") + # Terminate worker process p["process"].terminate() p["process"].join() + # Tell hdf5collector about timeout + queue_output.put( + { + "index": p["experiment_index"], + "faultlist": faultlist[p["experiment_index"]]["faultlist"], + "endpoint": -1, + "end_reason": "timeout", + } + ) if p["process"].is_alive() is False: # Recalculate moving average