From 20be81f81853a7db6f10fa4754dca59fa16412c5 Mon Sep 17 00:00:00 2001 From: Alexander Temerev Date: Thu, 21 Mar 2024 11:55:52 +0100 Subject: [PATCH] [BBPBGLIB-1139] Missing exception logging on configuration errors (#142) ## Context Sometimes, when there is an error (typo) in sonata config file, the run was just failing silently, without any exception logged. ## Scope There is a hack in commands.py, which synchronizes exception logging by allowing only a single node to log the exception to stderr (otherwise, production runs would be flooded by exception logs). It requires a file to write all MPI ranks for all failed runs, and choosing the first rank for reporting the exception. This temp file was supposed to be deleted on startup, but if the exception happens early enough (e.g. if there is a typo in the config file), the temp file removal code was not reached, and it messed with subsequent runs and their exception logging. This attempt at leader election is a hack, to say the least, but for now it should work with this fix. We might want to think how to implement this properly later; temp files are fragile and non-atomic. The code removing the temp file has been moved to earlier position in the execution, to commands.py. ## Testing It is hard to test for this in the multi-node environment. The change is minor. We might want to write a unit test that checks if this file is actually removed after starting commands.py, but does it make sense? ## Review * [X] PR description is complete * [X] Coding style (imports, function length, New functions, classes or files) are good * [?] Unit/Scientific test added * [X] Updated Readme, in-code, developer documentation --- neurodamus/commands.py | 5 +++++ neurodamus/core/_neurodamus.py | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/neurodamus/commands.py b/neurodamus/commands.py index 955300c6..0aab80ba 100644 --- a/neurodamus/commands.py +++ b/neurodamus/commands.py @@ -74,6 +74,11 @@ def neurodamus(args=None): # Warning control before starting the process _filter_warnings() + # Some previous executions may have left a bad exception node file + # This is done now so it's a very early stage and we know the mpi rank + if MPI.rank == 0 and os.path.exists(EXCEPTION_NODE_FILENAME): + os.remove(EXCEPTION_NODE_FILENAME) + try: Neurodamus(config_file, True, logging_level=log_level, **options).run() except ConfigurationError as e: # Common, only show error in Rank 0 diff --git a/neurodamus/core/_neurodamus.py b/neurodamus/core/_neurodamus.py index a66f79f8..7d94a2a4 100644 --- a/neurodamus/core/_neurodamus.py +++ b/neurodamus/core/_neurodamus.py @@ -44,11 +44,6 @@ def _init(cls, **kwargs): setup_logging(GlobalConfig.verbosity, log_name, MPI.rank) log_stage("Initializing Neurodamus... Logfile: " + log_name) - # Some previous executions may have left a bad exception node file - # This is done now so it's a very early stage and we know the mpi rank - if MPI.rank == 0 and os.path.exists(EXCEPTION_NODE_FILENAME): - os.remove(EXCEPTION_NODE_FILENAME) - # Load mods if not available cls._load_nrnmechlibs() log_verbose("Mechanisms (mod) library(s) successfully loaded")