From 3a74c3865ab6a8122a1ec773da5173686b7e3a0c Mon Sep 17 00:00:00 2001 From: Alexander Wagner Date: Thu, 2 Nov 2023 13:08:00 +0100 Subject: [PATCH] controller: Rewrite timeout mechanism Kill the qemu process as it stalls the ARCHIE execution. After the qemu process is killed, the python worker can be terminated. The last step in the timeout mechanism is a write to hdf5collector, so that the triggered timeout is recorded within the hdf5. --- controller.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/controller.py b/controller.py index 73480b2..1116f42 100755 --- a/controller.py +++ b/controller.py @@ -21,6 +21,7 @@ import logging from multiprocessing import Manager, Process, Value from pathlib import Path +import psutil import signal from statistics import mean import subprocess @@ -791,22 +792,40 @@ def controller( if len(times) > 0: time_max = max(times) - for i in range(len(p_list)): - p = p_list[i] + for i, p in enumerate(p_list): # Find finished processes p["process"].join(timeout=0) - # Kill process if timeout exceeded and gdb is not used + # Halt experiment if timeout duration exceeded + # If gdb is used the timeout is not applicable if ( p["process"].is_alive() and (time.time() - p["start_time"]) > config_qemu["timeout"] and not config_qemu.get("gdb", False) ): - clogger.error( - f"Process {p['process'].name} ran into timeout and was killed!" 
- ) + clogger.warning(f"Experiment {p['experiment_index']} ran into timeout") + # Search for qemu process and kill if found + qemu_process_name = f"qemu{p['experiment_index']}" + for process in psutil.process_iter(): + if process.name() != qemu_process_name: + continue + clogger.debug(f"{process.name()} killed") + process.kill() + break + else: + clogger.debug(f"{qemu_process_name} not found to kill") + # Terminate worker process p["process"].terminate() p["process"].join() + # Tell hdf5collector about timeout + queue_output.put( + { + "index": p["experiment_index"], + "faultlist": faultlist[p["experiment_index"]]["faultlist"], + "endpoint": -1, + "end_reason": "timeout", + } + ) if p["process"].is_alive() is False: # Recalculate moving average