Skip to content

Commit

Permalink
Avoid scaling if content of SLURM_RESUME_FILE is empty
Browse files (browse the repository at this point in the history)
Avoid scaling if the content of SLURM_RESUME_FILE is empty: log an error and take no scaling action instead of falling back to node list scaling.

Signed-off-by: Luca Carrogu <[email protected]>
  • Loading branch information
lukeseawalker committed Oct 12, 2023
1 parent ee6a953 commit d70da33
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 398 deletions.
10 changes: 1 addition & 9 deletions src/slurm_plugin/instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,15 +555,7 @@ def add_instances(
else:
logger.error(
"Not possible to perform job level scaling because Slurm resume file content is empty. "
"Falling back to node list scaling"
)
logger.info("The nodes_resume list from Slurm Resume Program is %s", print_with_count(node_list))
self._add_instances_for_nodes(
node_list=node_list,
launch_batch_size=launch_batch_size,
assign_node_batch_size=assign_node_batch_size,
update_node_address=update_node_address,
scaling_strategy=scaling_strategy,
"No scaling actions will be taken."
)

self._terminate_unassigned_launched_instances(terminate_batch_size)
Expand Down
15 changes: 1 addition & 14 deletions tests/slurm_plugin/test_instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1363,13 +1363,7 @@ def test_add_instances(
scaling_strategy=scaling_strategy,
)
else:
instance_manager._add_instances_for_nodes.assert_called_once_with(
node_list=node_list,
launch_batch_size=launch_batch_size,
assign_node_batch_size=assign_node_batch_size,
update_node_address=update_node_address,
scaling_strategy=scaling_strategy,
)
instance_manager._add_instances_for_resume_file.assert_not_called()


class TestJobLevelScalingInstanceManager:
Expand Down Expand Up @@ -1470,13 +1464,6 @@ def test_add_instances(

if not slurm_resume:
instance_manager._add_instances_for_resume_file.assert_not_called()
instance_manager._add_instances_for_nodes.assert_called_with(
node_list=node_list,
launch_batch_size=launch_batch_size,
assign_node_batch_size=assign_node_batch_size,
update_node_address=update_node_address,
scaling_strategy=scaling_strategy,
)
assert_that(caplog.text).contains(
"Not possible to perform job level scaling " "because Slurm resume file content is empty."
)
Expand Down
Loading

0 comments on commit d70da33

Please sign in to comment.