diff --git a/src/slurm_plugin/fleet_manager.py b/src/slurm_plugin/fleet_manager.py index 4d99db370..6f20f458a 100644 --- a/src/slurm_plugin/fleet_manager.py +++ b/src/slurm_plugin/fleet_manager.py @@ -11,6 +11,7 @@ import contextlib import copy import logging +import random import time from abc import ABC, abstractmethod @@ -383,15 +384,16 @@ def _get_instances_info(self, instance_ids: list): instances = [] partial_instance_ids = instance_ids - retry = 4 + retries = 5 + attempt_count = 0 # Wait for instances to be available in EC2 time.sleep(0.1) - while retry > 0 and partial_instance_ids: + while attempt_count < retries and partial_instance_ids: complete_instances, partial_instance_ids = self._retrieve_instances_info_from_ec2(partial_instance_ids) instances.extend(complete_instances) - retry = retry - 1 - if retry > 0: - time.sleep(0.3) + attempt_count += 1 + if attempt_count < retries: + time.sleep(0.3 * 2**attempt_count + random.uniform(0, 0.5)) # nosec B311 return instances, partial_instance_ids diff --git a/tests/slurm_plugin/test_fleet_manager.py b/tests/slurm_plugin/test_fleet_manager.py index 795f43fd1..76439b9c8 100644 --- a/tests/slurm_plugin/test_fleet_manager.py +++ b/tests/slurm_plugin/test_fleet_manager.py @@ -834,7 +834,7 @@ def test_launch_instances( generate_error=False, ), ] - + 3 + + 4 * [ MockedBoto3Request( method="describe_instances", @@ -887,7 +887,7 @@ def test_launch_instances( # client error ( ["i-12345"], - 4 + 5 * [ MockedBoto3Request( method="describe_instances",