Avoid scale-all optimization for all-or-nothing when single job
Avoid the scale-all optimization for all-or-nothing scaling when there is a single job, to avoid scaling twice in a row for the exact same nodes.

Signed-off-by: Luca Carrogu <[email protected]>
lukeseawalker committed Oct 30, 2023
1 parent e51284e commit 9912aff
Showing 2 changed files with 65 additions and 26 deletions.
20 changes: 11 additions & 9 deletions src/slurm_plugin/instance_manager.py
@@ -676,15 +676,17 @@ def _scaling_for_jobs_multi_node(
         update_node_address,
         scaling_strategy: ScalingStrategy,
     ):
-        # Optimize job level scaling with preliminary scale-all nodes attempt
-        self._update_dict(
-            self.unused_launched_instances,
-            self._launch_instances(
-                nodes_to_launch=self._parse_nodes_resume_list(node_list),
-                launch_batch_size=launch_batch_size,
-                scaling_strategy=scaling_strategy,
-            ),
-        )
+        if not (scaling_strategy in [ScalingStrategy.ALL_OR_NOTHING] and len(job_list) <= 1):
+            # Optimize job level scaling with preliminary scale-all nodes attempt
+            # Except for the all-or-nothing / single-job case, to avoid scaling the same node list twice
+            self._update_dict(
+                self.unused_launched_instances,
+                self._launch_instances(
+                    nodes_to_launch=self._parse_nodes_resume_list(node_list),
+                    launch_batch_size=launch_batch_size,
+                    scaling_strategy=scaling_strategy,
+                ),
+            )

         # Avoid a job level launch if scaling strategy is BEST_EFFORT or GREEDY_ALL_OR_NOTHING
         # The scale all-in launch has been performed already hence from this point we want to skip the extra
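For reference, a minimal standalone sketch of the guard this commit introduces. The `ScalingStrategy` enum values and the job-list shape are simplified stand-ins for illustration, not the plugin's real types:

```python
# Minimal sketch of the guard added above; simplified stand-ins for the
# plugin's real types, not the actual plugin code.
from enum import Enum


class ScalingStrategy(Enum):
    ALL_OR_NOTHING = "all-or-nothing"
    GREEDY_ALL_OR_NOTHING = "greedy-all-or-nothing"
    BEST_EFFORT = "best-effort"


def run_scale_all_optimization(scaling_strategy, job_list) -> bool:
    """Skip the preliminary scale-all launch when the strategy is
    all-or-nothing and there is at most one job: the job-level launch
    would request the exact same node list, so the optimization would
    only launch the same nodes twice in a row."""
    return not (scaling_strategy == ScalingStrategy.ALL_OR_NOTHING and len(job_list) <= 1)


assert not run_scale_all_optimization(ScalingStrategy.ALL_OR_NOTHING, ["job-1"])
assert run_scale_all_optimization(ScalingStrategy.ALL_OR_NOTHING, ["job-1", "job-2"])
assert run_scale_all_optimization(ScalingStrategy.BEST_EFFORT, ["job-1"])
```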
71 changes: 54 additions & 17 deletions tests/slurm_plugin/test_instance_manager.py
@@ -4171,7 +4171,8 @@ def test_all_or_nothing_node_assignment(
"unused_launched_instances, "
"mock_launch_instances, "
"expected_unused_launched_instances, "
"expect_to_skip_job_level_launch",
"expect_to_skip_job_level_launch, "
"expect_optimize_launch_instances_called",
[
(
[],
@@ -4184,6 +4185,7 @@
{},
{},
True,
True,
),
(
[],
@@ -4196,6 +4198,7 @@
{},
{},
False,
False,
),
(
[],
@@ -4208,6 +4211,7 @@
{},
{},
True,
True,
),
(
[],
@@ -4236,6 +4240,7 @@
}
},
True,
True,
),
(
[],
@@ -4245,6 +4250,18 @@
False,
ScalingStrategy.ALL_OR_NOTHING,
{},
{},
{},
False,
False,
),
(
[],
[],
1,
2,
True,
ScalingStrategy.ALL_OR_NOTHING,
{
"q1": {
"c1": [
@@ -4258,51 +4275,58 @@
"q1": {
"c1": [
EC2Instance(
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
"i-12346", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)
)
]
}
},
False,
),
(
[],
[],
1,
2,
True,
ScalingStrategy.ALL_OR_NOTHING,
{
"q1": {
"c1": [
EC2Instance(
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
)
),
]
}
},
False,
False,
),
(
[
SlurmResumeJob(
job_id=140816,
nodes_alloc="queue3-st-c5xlarge-[7-10]",
nodes_resume="queue3-st-c5xlarge-[7-9]",
oversubscribe="NO",
),
],
["queue4-st-c5xlarge-1"],
3,
2,
True,
ScalingStrategy.ALL_OR_NOTHING,
{
"q1": {
"c1": [
EC2Instance(
"i-12346", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
)
]
}
},
{},
{
"q1": {
"c1": [
EC2Instance(
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
),
EC2Instance(
"i-12346", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)
),
]
}
},
},
False,
False,
),
(
[
@@ -4312,6 +4336,12 @@
nodes_resume="queue3-st-c5xlarge-[7-9]",
oversubscribe="NO",
),
SlurmResumeJob(
job_id=140817,
nodes_alloc="queue3-st-c5xlarge-[11-12]",
nodes_resume="queue3-st-c5xlarge-[11-12]",
oversubscribe="NO",
),
],
["queue4-st-c5xlarge-1"],
3,
@@ -4353,6 +4383,7 @@
},
},
False,
True,
),
],
)
@@ -4370,6 +4401,7 @@
mock_launch_instances,
expected_unused_launched_instances,
expect_to_skip_job_level_launch,
expect_optimize_launch_instances_called,
):
# patch internal functions
instance_manager._launch_instances = mocker.MagicMock(return_value=mock_launch_instances)
@@ -4395,6 +4427,11 @@
skip_launch=expect_to_skip_job_level_launch,
)

if expect_optimize_launch_instances_called:
instance_manager._launch_instances.assert_called_once()
else:
instance_manager._launch_instances.assert_not_called()

assert_that(instance_manager.unused_launched_instances).is_equal_to(expected_unused_launched_instances)

@pytest.mark.parametrize(
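The new `expect_optimize_launch_instances_called` parameter drives the `assert_called_once` / `assert_not_called` checks above. A self-contained sketch of that verification pattern, using `unittest.mock` directly and a toy `scale()` standing in for `_scaling_for_jobs_multi_node` (a hypothetical simplification, not the suite's actual fixture setup):

```python
# Illustration of the mock-assertion pattern used by the updated test,
# with a toy scale() standing in for _scaling_for_jobs_multi_node.
from unittest.mock import MagicMock


def scale(strategy: str, job_list: list, launch_instances) -> None:
    # Same guard as the production change: skip the preliminary
    # scale-all launch for all-or-nothing with a single job.
    if not (strategy == "all_or_nothing" and len(job_list) <= 1):
        launch_instances()


# Single all-or-nothing job: the scale-all optimization must be skipped.
mock_launch = MagicMock(return_value={})
scale("all_or_nothing", ["job-1"], mock_launch)
mock_launch.assert_not_called()

# Two jobs: the preliminary scale-all launch runs exactly once.
mock_launch = MagicMock(return_value={})
scale("all_or_nothing", ["job-1", "job-2"], mock_launch)
mock_launch.assert_called_once()
```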
