Avoid scale-all optimization for all-or-nothing when single job
Avoid the scale-all optimization for all-or-nothing scaling when there is a single job, to avoid scaling twice in a row for the exact same nodes.

Signed-off-by: Luca Carrogu <[email protected]>
lukeseawalker committed Oct 30, 2023
1 parent e51284e commit 9912aff
Showing 2 changed files with 65 additions and 26 deletions.
20 changes: 11 additions & 9 deletions src/slurm_plugin/instance_manager.py
@@ -676,15 +676,17 @@ def _scaling_for_jobs_multi_node(
         update_node_address,
         scaling_strategy: ScalingStrategy,
     ):
-        # Optimize job level scaling with preliminary scale-all nodes attempt
-        self._update_dict(
-            self.unused_launched_instances,
-            self._launch_instances(
-                nodes_to_launch=self._parse_nodes_resume_list(node_list),
-                launch_batch_size=launch_batch_size,
-                scaling_strategy=scaling_strategy,
-            ),
-        )
+        if not (scaling_strategy in [ScalingStrategy.ALL_OR_NOTHING] and len(job_list) <= 1):
+            # Optimize job level scaling with preliminary scale-all nodes attempt
+            # Except for the all-or-nothing / single-job case, to avoid scaling the same node list twice
+            self._update_dict(
+                self.unused_launched_instances,
+                self._launch_instances(
+                    nodes_to_launch=self._parse_nodes_resume_list(node_list),
+                    launch_batch_size=launch_batch_size,
+                    scaling_strategy=scaling_strategy,
+                ),
+            )

         # Avoid a job level launch if scaling strategy is BEST_EFFORT or GREEDY_ALL_OR_NOTHING
         # The scale all-in launch has been performed already hence from this point we want to skip the extra
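For reference, a minimal standalone sketch of the guard this commit introduces. The `ScalingStrategy` enum values and the job-list shape are simplified stand-ins for illustration, not the plugin's real types:

```python
# Minimal sketch of the guard added above; simplified stand-ins for the
# plugin's real types, not the actual plugin code.
from enum import Enum


class ScalingStrategy(Enum):
    ALL_OR_NOTHING = "all-or-nothing"
    GREEDY_ALL_OR_NOTHING = "greedy-all-or-nothing"
    BEST_EFFORT = "best-effort"


def run_scale_all_optimization(scaling_strategy, job_list) -> bool:
    """Skip the preliminary scale-all launch when the strategy is
    all-or-nothing and there is at most one job: the job-level launch
    would request the exact same node list, so the optimization would
    only launch the same nodes twice in a row."""
    return not (scaling_strategy == ScalingStrategy.ALL_OR_NOTHING and len(job_list) <= 1)


assert not run_scale_all_optimization(ScalingStrategy.ALL_OR_NOTHING, ["job-1"])
assert run_scale_all_optimization(ScalingStrategy.ALL_OR_NOTHING, ["job-1", "job-2"])
assert run_scale_all_optimization(ScalingStrategy.BEST_EFFORT, ["job-1"])
```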
71 changes: 54 additions & 17 deletions tests/slurm_plugin/test_instance_manager.py
@@ -4171,7 +4171,8 @@ def test_all_or_nothing_node_assignment(
"unused_launched_instances, "
"mock_launch_instances, "
"expected_unused_launched_instances, "
"expect_to_skip_job_level_launch",
"expect_to_skip_job_level_launch, "
"expect_optimize_launch_instances_called",
[
(
[],
@@ -4184,6 +4185,7 @@
{},
{},
True,
True,
),
(
[],
@@ -4196,6 +4198,7 @@
{},
{},
False,
False,
),
(
[],
@@ -4208,6 +4211,7 @@
{},
{},
True,
True,
),
(
[],
@@ -4236,6 +4240,7 @@
}
},
True,
True,
),
(
[],
@@ -4245,6 +4250,18 @@
False,
ScalingStrategy.ALL_OR_NOTHING,
{},
{},
{},
False,
False,
),
(
[],
[],
1,
2,
True,
ScalingStrategy.ALL_OR_NOTHING,
{
"q1": {
"c1": [
@@ -4258,51 +4275,58 @@
"q1": {
"c1": [
EC2Instance(
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
"i-12346", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)
)
]
}
},
False,
),
(
[],
[],
1,
2,
True,
ScalingStrategy.ALL_OR_NOTHING,
{
"q1": {
"c1": [
EC2Instance(
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
)
),
]
}
},
False,
False,
),
(
[
SlurmResumeJob(
job_id=140816,
nodes_alloc="queue3-st-c5xlarge-[7-10]",
nodes_resume="queue3-st-c5xlarge-[7-9]",
oversubscribe="NO",
),
],
["queue4-st-c5xlarge-1"],
3,
2,
True,
ScalingStrategy.ALL_OR_NOTHING,
{
"q1": {
"c1": [
EC2Instance(
"i-12346", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
)
]
}
},
{},
{
"q1": {
"c1": [
EC2Instance(
"i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)
),
EC2Instance(
"i-12346", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)
),
]
}
},
},
False,
False,
),
(
[
@@ -4312,6 +4336,12 @@
nodes_resume="queue3-st-c5xlarge-[7-9]",
oversubscribe="NO",
),
SlurmResumeJob(
job_id=140817,
nodes_alloc="queue3-st-c5xlarge-[11-12]",
nodes_resume="queue3-st-c5xlarge-[11-12]",
oversubscribe="NO",
),
],
["queue4-st-c5xlarge-1"],
3,
@@ -4353,6 +4383,7 @@
},
},
False,
True,
),
],
)
@@ -4370,6 +4401,7 @@
mock_launch_instances,
expected_unused_launched_instances,
expect_to_skip_job_level_launch,
expect_optimize_launch_instances_called,
):
# patch internal functions
instance_manager._launch_instances = mocker.MagicMock(return_value=mock_launch_instances)
@@ -4395,6 +4427,11 @@
skip_launch=expect_to_skip_job_level_launch,
)

if expect_optimize_launch_instances_called:
instance_manager._launch_instances.assert_called_once()
else:
instance_manager._launch_instances.assert_not_called()

assert_that(instance_manager.unused_launched_instances).is_equal_to(expected_unused_launched_instances)

@pytest.mark.parametrize(
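The new `expect_optimize_launch_instances_called` parameter drives the `assert_called_once` / `assert_not_called` checks above. A self-contained sketch of that verification pattern, using `unittest.mock` directly and a toy `scale()` standing in for `_scaling_for_jobs_multi_node` (a hypothetical simplification, not the suite's actual fixture setup):

```python
# Illustration of the mock-assertion pattern used by the updated test,
# with a toy scale() standing in for _scaling_for_jobs_multi_node.
from unittest.mock import MagicMock


def scale(strategy: str, job_list: list, launch_instances) -> None:
    # Same guard as the production change: skip the preliminary
    # scale-all launch for all-or-nothing with a single job.
    if not (strategy == "all_or_nothing" and len(job_list) <= 1):
        launch_instances()


# Single all-or-nothing job: the scale-all optimization must be skipped.
mock_launch = MagicMock(return_value={})
scale("all_or_nothing", ["job-1"], mock_launch)
mock_launch.assert_not_called()

# Two jobs: the preliminary scale-all launch runs exactly once.
mock_launch = MagicMock(return_value={})
scale("all_or_nothing", ["job-1", "job-2"], mock_launch)
mock_launch.assert_called_once()
```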
