Skip to content

Commit

Permalink
Change retry logic on DescribeInstances API call
Browse files Browse the repository at this point in the history
Change retry logic on DescribeInstances to use exponential backoff plus a random number in the interval [0, 0.5) seconds.
 The random number adds jitter, so that retries from different callers do not hit the API in synchronized waves.

The maximum number of DescribeInstances attempts has been increased from 4 to 5.

Signed-off-by: Luca Carrogu <[email protected]>
  • Loading branch information
lukeseawalker committed Sep 6, 2023
1 parent 825f050 commit 16b8026
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 7 deletions.
12 changes: 7 additions & 5 deletions src/slurm_plugin/fleet_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import contextlib
import copy
import logging
import secrets
import time
from abc import ABC, abstractmethod

Expand Down Expand Up @@ -383,15 +384,16 @@ def _get_instances_info(self, instance_ids: list):
instances = []
partial_instance_ids = instance_ids

retry = 4
retries = 5
attempt_count = 0
# Wait for instances to be available in EC2
time.sleep(0.1)
while retry > 0 and partial_instance_ids:
while attempt_count < retries and partial_instance_ids:
complete_instances, partial_instance_ids = self._retrieve_instances_info_from_ec2(partial_instance_ids)
instances.extend(complete_instances)
retry = retry - 1
if retry > 0:
time.sleep(0.3)
attempt_count += 1
if attempt_count < retries:
time.sleep(0.3 * 2**attempt_count + (secrets.randbelow(500) / 1000))

return instances, partial_instance_ids

Expand Down
4 changes: 2 additions & 2 deletions tests/slurm_plugin/test_fleet_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ def test_launch_instances(
generate_error=False,
),
]
+ 3
+ 4
* [
MockedBoto3Request(
method="describe_instances",
Expand Down Expand Up @@ -887,7 +887,7 @@ def test_launch_instances(
# client error
(
["i-12345"],
4
5
* [
MockedBoto3Request(
method="describe_instances",
Expand Down

0 comments on commit 16b8026

Please sign in to comment.