From c90ff2af84782a3be63a167b349a6dbf764da878 Mon Sep 17 00:00:00 2001 From: Nicholas Mei Date: Tue, 17 Dec 2024 13:36:48 -0800 Subject: [PATCH] Add a retry to `detect_mount_points()` function During a merscope pipeline analysis run, a dist-data-sync: Batch Data Sync Batch SubmitJob API call failed due to the following proximal causes: ``` File "/var/task/aibs_informatics_aws_utils/efs/mount_point.py", line 385, in detect_mount_points batch_mp_configs = _detect_moint_points_from_batch_job(batch_job_id) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/var/task/aibs_informatics_aws_utils/efs/mount_point.py", line 464, in _detect_moint_points_from_batch_job response = batch.describe_jobs(jobs=[batch_job_id]) ``` The last `batch.describe_jobs()` call eventually resulted in the error: `botocore.exceptions.NoCredentialsError: Unable to locate credentials` This commit tries to fix this by adding a retry to the `detect_mount_points()` function (which calls the `_detect_moint_points_from_batch_job()` function) as well as a lambda version. The retry is only triggered when a `NoCredentialsError` is raised, under the assumption that such an error is ephemeral. 
--- src/aibs_informatics_aws_utils/efs/mount_point.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/aibs_informatics_aws_utils/efs/mount_point.py b/src/aibs_informatics_aws_utils/efs/mount_point.py index 7439302..d63f8fd 100644 --- a/src/aibs_informatics_aws_utils/efs/mount_point.py +++ b/src/aibs_informatics_aws_utils/efs/mount_point.py @@ -16,8 +16,10 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union from aibs_informatics_core.models.aws.efs import AccessPointId, EFSPath, FileSystemId +from aibs_informatics_core.utils.decorators import retry from aibs_informatics_core.utils.hashing import sha256_hexdigest from aibs_informatics_core.utils.os_operations import get_env_var +from botocore.exceptions import NoCredentialsError from aibs_informatics_aws_utils.constants.efs import ( EFS_MOUNT_POINT_ID_VAR, @@ -377,6 +379,7 @@ def __repr__(self) -> str: @cache +@retry(retryable_exceptions=(NoCredentialsError), tries=5, backoff=2.0) def detect_mount_points() -> List[MountPointConfiguration]: mount_points: List[MountPointConfiguration] = []