Skip to content

Commit

Permalink
Change driver retries param to total_attempts
Browse files: browse the repository at this point in the history
  • Loading branch information
jonathan-eq committed Aug 30, 2024
1 parent f4ffb68 commit e68b3be
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 13 deletions.
6 changes: 3 additions & 3 deletions src/ert/scheduler/driver.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -80,7 +80,7 @@ async def _execute_with_retry(
retry_codes: Iterable[int] = (),
accept_codes: Iterable[int] = (),
stdin: Optional[bytes] = None,
retries: int = 1,
total_attempts: int = 1,
retry_interval: float = 1.0,
driverlogger: Optional[logging.Logger] = None,
exit_on_msgs: Iterable[str] = (),
Expand All @@ -89,7 +89,7 @@ async def _execute_with_retry(
_logger = driverlogger or logging.getLogger(__name__)
error_message: Optional[str] = None

for _ in range(retries):
for _ in range(total_attempts):
process = await asyncio.create_subprocess_exec(
*cmd_with_args,
stdin=asyncio.subprocess.PIPE if stdin else None,
Expand Down Expand Up @@ -139,7 +139,7 @@ async def _execute_with_retry(

await asyncio.sleep(retry_interval)
error_message = (
f'Command "{shlex.join(cmd_with_args)}" failed after {retries} retries '
f'Command "{shlex.join(cmd_with_args)}" failed after {total_attempts} attempts '
f"with {outputs}"
)
_logger.error(error_message)
Expand Down
10 changes: 5 additions & 5 deletions src/ert/scheduler/lsf_driver.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -332,7 +332,7 @@ async def submit(
bsub_with_args,
retry_on_empty_stdout=True,
retry_codes=(FLAKY_SSH_RETURNCODE,),
retries=self._bsub_retries,
total_attempts=self._bsub_retries,
retry_interval=self._sleep_time_between_cmd_retries,
)
if not process_success:
Expand Down Expand Up @@ -384,7 +384,7 @@ async def kill(self, iens: int) -> None:
_, process_message = await self._execute_with_retry(
bkill_with_args,
retry_codes=(FLAKY_SSH_RETURNCODE,),
retries=3,
total_attempts=3,
retry_interval=self._sleep_time_between_cmd_retries,
exit_on_msgs=(JOB_ALREADY_FINISHED_BKILL_MSG),
)
Expand Down Expand Up @@ -496,7 +496,7 @@ async def _get_exit_code(self, job_id: str) -> int:
success, output = await self._execute_with_retry(
[f"{self._bjobs_cmd}", "-o exit_code", "-noheader", f"{job_id}"],
retry_codes=(FLAKY_SSH_RETURNCODE,),
retries=3,
total_attempts=3,
retry_interval=self._sleep_time_between_cmd_retries,
)

Expand All @@ -514,7 +514,7 @@ async def _get_exit_code_from_bhist(self, job_id: str) -> int:
success, output = await self._execute_with_retry(
[f"{self._bhist_cmd}", "-l", "-n2", f"{job_id}"],
retry_codes=(FLAKY_SSH_RETURNCODE,),
retries=3,
total_attempts=3,
retry_interval=self._sleep_time_between_cmd_retries,
)

Expand All @@ -534,7 +534,7 @@ async def _log_bhist_job_summary(self, job_id: str) -> None:
_, process_message = await self._execute_with_retry(
bhist_with_args,
retry_codes=(FLAKY_SSH_RETURNCODE,),
retries=3,
total_attempts=3,
retry_interval=self._sleep_time_between_cmd_retries,
log_to_debug=False,
)
Expand Down
4 changes: 2 additions & 2 deletions src/ert/scheduler/openpbs_driver.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -268,7 +268,7 @@ async def submit(
QSUB_CONNECTION_REFUSED,
),
stdin=script.encode(encoding="utf-8"),
retries=self._num_pbs_cmd_retries,
total_attempts=self._num_pbs_cmd_retries,
retry_interval=self._sleep_time_between_cmd_retries,
driverlogger=logger,
)
Expand Down Expand Up @@ -298,7 +298,7 @@ async def kill(self, iens: int) -> None:
[str(self._qdel_cmd), str(job_id)],
retry_codes=(QDEL_REQUEST_INVALID,),
accept_codes=(QDEL_JOB_HAS_FINISHED,),
retries=self._num_pbs_cmd_retries,
total_attempts=self._num_pbs_cmd_retries,
retry_interval=self._sleep_time_between_cmd_retries,
driverlogger=logger,
)
Expand Down
2 changes: 1 addition & 1 deletion src/ert/scheduler/slurm_driver.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -209,7 +209,7 @@ async def submit(
sbatch_with_args,
retry_on_empty_stdout=True,
retry_codes=(),
retries=self._sbatch_retries,
total_attempts=self._sbatch_retries,
retry_interval=self._sleep_time_between_cmd_retries,
)
if not process_success:
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/scheduler/test_lsf_driver.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -560,7 +560,7 @@ async def test_that_bsub_will_retry_and_fail(
driver._bsub_retries = 2
driver._sleep_time_between_cmd_retries = 0.2
match_str = (
f'failed after 2 retries with exit code {exit_code}.*error: "{error_msg if error_msg else "<empty>"}"'
f'failed after 2 attempts with exit code {exit_code}.*error: "{error_msg if error_msg else "<empty>"}"'
if exit_code != 199
else 'failed with exit code 199.*error: "Not recognized"'
)
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/scheduler/test_openpbs_driver.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -393,7 +393,7 @@ async def test_that_qsub_will_retry_and_fail(
driver._num_pbs_cmd_retries = 2
driver._sleep_time_between_cmd_retries = 0.2
match_str = (
f'failed after 2 retries with exit code {exit_code}.*error: "{error_msg}"'
f'failed after 2 attempts with exit code {exit_code}.*error: "{error_msg}"'
if exit_code != 199
else 'failed with exit code 199.*error: "Not recognized"'
)
Expand Down

0 comments on commit e68b3be

Please sign in to comment.