Skip to content

Commit

Permalink
DNM: Attempt to make run_training error on error
Browse files Browse the repository at this point in the history
Signed-off-by: Ihar Hrachyshka <[email protected]>
  • Loading branch information
booxter committed Jul 24, 2024
1 parent 9fdeb87 commit a1f56f3
Showing 1 changed file with 10 additions and 7 deletions.
17 changes: 10 additions & 7 deletions src/instructlab/training/main_ds.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,6 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
command.append("--cpu_offload_optimizer_pin_memory")

print(f"\033[92mRunning command: {' '.join(command)}\033[0m")
process = None
try:
process = StreamablePopen(
f"{train_args.ckpt_output_dir}/full_logs_global{torch_args.node_rank}.log",
Expand All @@ -674,17 +673,21 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
print("Process interrupted by user")
except Exception as e:
print(f"An error occurred: {str(e)}")
finally:
if "process" not in locals() or process is None:
return

raise
else:
print("\033[91mTerminating process 🤖\033[0m")
process.terminate()
try:
process.wait(timeout=60)
except subprocess.TimeoutExpired:
rc = process.wait(timeout=60)
if rc:
print(f"Process exited with code {rc}")
# TODO: raise something instructlab specific
raise SystemExit(rc)
except subprocess.TimeoutExpired as e:
print("\033[91mProcess did not terminate in time, killing it.\033[0m")
process.kill()
# TODO: raise something instructlab specific
raise SystemExit(1) from e


if __name__ == "__main__":
Expand Down

0 comments on commit a1f56f3

Please sign in to comment.