Skip to content

Commit

Permalink
ux: Improve log message for lock timeout
Browse files: browse the repository at this point in the history
  • Loading branch information
eddiebergman committed Dec 11, 2024
1 parent 722ef9d commit f629096
Showing 1 changed file with 20 additions and 6 deletions.
26 changes: 20 additions & 6 deletions neps/runtime.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,6 +20,8 @@
TypeVar,
)

from portalocker import portalocker

from neps.env import (
FS_SYNC_GRACE_BASE,
FS_SYNC_GRACE_INC,
Expand Down Expand Up @@ -529,12 +531,24 @@ def run(self) -> None: # noqa: C901, PLR0915
_repeated_fail_get_next_trial_count = 0
except Exception as e:
_repeated_fail_get_next_trial_count += 1
logger.debug(
"Worker '%s': Error while trying to get the next trial to evaluate.",
self.worker_id,
exc_info=True,
)
time.sleep(1) # Help stagger retries
if isinstance(e, portalocker.exceptions.LockException):
logger.debug(
"Worker '%s': Timeout while trying to get the next trial to"
" evaluate. If you are using a model based optimizer, such as"
" Bayesian Optimization, this can occur as the number of"
" configurations get large. There's not much to do here"
" and we will retry to obtain the lock.",
self.worker_id,
exc_info=True,
)
else:
logger.debug(
"Worker '%s': Error while trying to get the next trial to"
" evaluate.",
self.worker_id,
exc_info=True,
)
time.sleep(1) # Help stagger retries
# NOTE: This is to prevent any infinite loops if we can't get a trial
if _repeated_fail_get_next_trial_count >= MAX_RETRIES_GET_NEXT_TRIAL:
raise WorkerFailedToGetPendingTrialsError(
Expand Down

0 comments on commit f629096

Please sign in to comment.