diff --git a/aana/sdk.py b/aana/sdk.py index a49317e6..ea9554f1 100644 --- a/aana/sdk.py +++ b/aana/sdk.py @@ -45,7 +45,7 @@ def __init__( name (str, optional): The name of the application. Defaults to "app". migration_func (Callable | None): The migration function to run. Defaults to None. retryable_exceptions (list[Exception, str] | None): The exceptions that can be retried in the task queue. - Defaults to ['InferenceException']. + Defaults to ['InferenceException', 'ActorDiedError', 'OutOfMemoryError']. """ self.name = name self.migration_func = migration_func @@ -53,7 +53,11 @@ def __init__( self.deployments: dict[str, Deployment] = {} if retryable_exceptions is None: - self.retryable_exceptions = [InferenceException] + self.retryable_exceptions = [ + "InferenceException", + "ActorDiedError", + "OutOfMemoryError", + ] else: self.retryable_exceptions = retryable_exceptions # Convert exceptions to string if they are not already diff --git a/aana/utils/core.py b/aana/utils/core.py index 31d89236..6e27dea9 100644 --- a/aana/utils/core.py +++ b/aana/utils/core.py @@ -93,7 +93,9 @@ async def sleep_exponential_backoff( attempts (int): The number of attempts so far. jitter (bool): Whether to add jitter to the delay. Default is True. """ - delay = min(initial_delay * (2**attempts), max_delay) + # Prevent overflow by using min(attempts, 32) since 2**32 is already huge + capped_attempt = min(attempts, 32) + delay = min(initial_delay * (2**capped_attempt), max_delay) # Full jitter delay_with_jitter = random.uniform(0, delay) if jitter else delay # noqa: S311 await asyncio.sleep(delay_with_jitter)