From a1f56f3335bd3e0be10e218922be9e38e3a5cac8 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Wed, 24 Jul 2024 18:21:09 -0400 Subject: [PATCH] DNM: Attempt to make run_training error on error Signed-off-by: Ihar Hrachyshka --- src/instructlab/training/main_ds.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index b56d5b85..b0edc9bc 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -663,7 +663,6 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: command.append("--cpu_offload_optimizer_pin_memory") print(f"\033[92mRunning command: {' '.join(command)}\033[0m") - process = None try: process = StreamablePopen( f"{train_args.ckpt_output_dir}/full_logs_global{torch_args.node_rank}.log", @@ -674,17 +673,21 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: print("Process interrupted by user") except Exception as e: print(f"An error occurred: {str(e)}") - finally: - if "process" not in locals() or process is None: - return - + raise + else: print("\033[91mTerminating process 🤖\033[0m") process.terminate() try: - process.wait(timeout=60) - except subprocess.TimeoutExpired: + rc = process.wait(timeout=60) + if rc: + print(f"Process exited with code {rc}") + # TODO: raise something instructlab specific + raise SystemExit(rc) + except subprocess.TimeoutExpired as e: print("\033[91mProcess did not terminate in time, killing it.\033[0m") process.kill() + # TODO: raise something instructlab specific + raise SystemExit(1) from e if __name__ == "__main__":