diff --git a/src/slurm_plugin/instance_manager.py b/src/slurm_plugin/instance_manager.py index 68fa9aa0e..65b0d3d8b 100644 --- a/src/slurm_plugin/instance_manager.py +++ b/src/slurm_plugin/instance_manager.py @@ -555,15 +555,7 @@ def add_instances( else: logger.error( "Not possible to perform job level scaling because Slurm resume file content is empty. " - "Falling back to node list scaling" - ) - logger.info("The nodes_resume list from Slurm Resume Program is %s", print_with_count(node_list)) - self._add_instances_for_nodes( - node_list=node_list, - launch_batch_size=launch_batch_size, - assign_node_batch_size=assign_node_batch_size, - update_node_address=update_node_address, - scaling_strategy=scaling_strategy, + "No scaling actions will be taken." ) self._terminate_unassigned_launched_instances(terminate_batch_size) diff --git a/tests/slurm_plugin/test_instance_manager.py b/tests/slurm_plugin/test_instance_manager.py index 2345e933b..6c9daf3ce 100644 --- a/tests/slurm_plugin/test_instance_manager.py +++ b/tests/slurm_plugin/test_instance_manager.py @@ -1363,13 +1363,7 @@ def test_add_instances( scaling_strategy=scaling_strategy, ) else: - instance_manager._add_instances_for_nodes.assert_called_once_with( - node_list=node_list, - launch_batch_size=launch_batch_size, - assign_node_batch_size=assign_node_batch_size, - update_node_address=update_node_address, - scaling_strategy=scaling_strategy, - ) + instance_manager._add_instances_for_resume_file.assert_not_called() class TestJobLevelScalingInstanceManager: @@ -1470,13 +1464,6 @@ def test_add_instances( if not slurm_resume: instance_manager._add_instances_for_resume_file.assert_not_called() - instance_manager._add_instances_for_nodes.assert_called_with( - node_list=node_list, - launch_batch_size=launch_batch_size, - assign_node_batch_size=assign_node_batch_size, - update_node_address=update_node_address, - scaling_strategy=scaling_strategy, - ) assert_that(caplog.text).contains( "Not possible to perform job level scaling " "because Slurm resume file content is empty." ) diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py index d2e9007ba..45beee49a 100644 --- a/tests/slurm_plugin/test_resume.py +++ b/tests/slurm_plugin/test_resume.py @@ -379,376 +379,6 @@ def test_resume_config(config_file, expected_attributes, test_datadir, mocker): False, False, ), - # job level scaling + empty resume file + all_or_nothing_batch with ICE error - ( - [ - SimpleNamespace(name="queue1-dy-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-dy-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - ], - 3, - ScalingStrategy.ALL_OR_NOTHING, - [ - { - "Instances": [ - { - "InstanceId": "i-11111", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.1", - "PrivateDnsName": "ip-1-0-0-1", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.1", - }, - ], - }, - { - "InstanceId": "i-22222", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.2", - "PrivateDnsName": "ip-1-0-0-2", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.2", - }, - ], - }, - { - "InstanceId": "i-33333", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.3", - "PrivateDnsName": "ip-1-0-0-3", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.3", - }, - ], - }, - ] - }, - client_error("InsufficientInstanceCapacity"), - ], - {"InsufficientInstanceCapacity": {"queue1-st-c5xlarge-2"}}, - [], - {}, - True, - True, - ), - # job level scaling + empty resume file + all_or_nothing without error - ( - [ - SimpleNamespace(name="queue1-dy-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-dy-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - ], - 3, - ScalingStrategy.ALL_OR_NOTHING, - [ - { - "Instances": [ - { - "InstanceId": "i-11111", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.1", - "PrivateDnsName": "ip-1-0-0-1", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.1", - }, - ], - }, - { - "InstanceId": "i-22222", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.2", - "PrivateDnsName": "ip-1-0-0-2", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.2", - }, - ], - }, - { - "InstanceId": "i-33333", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.3", - "PrivateDnsName": "ip-1-0-0-3", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.3", - }, - ], - }, - { - "InstanceId": "i-44444", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.4", - "PrivateDnsName": "ip-1-0-0-4", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.4", - }, - ], - }, - ] - }, - ], - {}, - [ - call( - ["queue1-dy-c5xlarge-1", "queue1-dy-c5xlarge-2", "queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2"], - nodeaddrs=["ip.1.0.0.1", "ip.1.0.0.2", "ip.1.0.0.3", "ip.1.0.0.4"], - nodehostnames=None, - ) - ], - dict( - zip( - ["queue1-dy-c5xlarge-1", "queue1-dy-c5xlarge-2", "queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2"], - [ - EC2Instance("i-11111", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)), - EC2Instance("i-22222", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)), - EC2Instance("i-33333", "ip.1.0.0.3", "ip-1-0-0-3", datetime(2020, 1, 1, tzinfo=timezone.utc)), - EC2Instance("i-44444", "ip.1.0.0.4", "ip-1-0-0-4", datetime(2020, 1, 1, tzinfo=timezone.utc)), - ], - ) - ), - True, - True, - ), - # job level scaling + empty resume file + best_effort with CLIENT error - ( - [ - SimpleNamespace(name="queue1-dy-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-dy-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - ], - 3, - ScalingStrategy.BEST_EFFORT, - [ - { - "Instances": [ - { - "InstanceId": "i-11111", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.1", - "PrivateDnsName": "ip-1-0-0-1", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.1", - }, - ], - }, - ] - }, - client_error("ServiceUnavailable"), - ], - { - "ServiceUnavailable": {"queue1-st-c5xlarge-2"}, - "LimitedInstanceCapacity": {"queue1-dy-c5xlarge-2", "queue1-st-c5xlarge-1"}, - }, - [call(["queue1-dy-c5xlarge-1"], nodeaddrs=["ip.1.0.0.1"], nodehostnames=None)], - dict( - zip( - ["queue1-dy-c5xlarge-1"], - [ - EC2Instance("i-11111", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)), - ], - ) - ), - True, - True, - ), - # job level scaling + empty resume file + best_effort wit ICE error - ( - [ - SimpleNamespace(name="queue1-dy-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-dy-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - ], - 3, - ScalingStrategy.BEST_EFFORT, - [ - { - "Instances": [ - { - "InstanceId": "i-11111", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.1", - "PrivateDnsName": "ip-1-0-0-1", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.1", - }, - ], - }, - ] - }, - client_error("InsufficientReservedInstanceCapacity"), - ], - {"InsufficientReservedInstanceCapacity": {"queue1-st-c5xlarge-2"}}, - [call(["queue1-dy-c5xlarge-1"], nodeaddrs=["ip.1.0.0.1"], nodehostnames=None)], - dict( - zip( - ["queue1-dy-c5xlarge-1"], - [ - EC2Instance("i-11111", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)), - ], - ) - ), - True, - True, - ), - # job level scaling + empty resume file + best-effort without error - ( - [ - SimpleNamespace(name="queue1-dy-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-dy-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-1", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - SimpleNamespace(name="queue1-st-c5xlarge-2", state_string="ALLOCATED+CLOUD+NOT_RESPONDING+POWERING_UP"), - ], - 3, - ScalingStrategy.BEST_EFFORT, - [ - { - "Instances": [ - { - "InstanceId": "i-11111", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.1", - "PrivateDnsName": "ip-1-0-0-1", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.1", - }, - ], - }, - { - "InstanceId": "i-22222", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.2", - "PrivateDnsName": "ip-1-0-0-2", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.2", - }, - ], - }, - { - "InstanceId": "i-33333", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.3", - "PrivateDnsName": "ip-1-0-0-3", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.3", - }, - ], - }, - { - "InstanceId": "i-44444", - "InstanceType": "c5.xlarge", - "PrivateIpAddress": "ip.1.0.0.4", - "PrivateDnsName": "ip-1-0-0-4", - "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), - "NetworkInterfaces": [ - { - "Attachment": { - "DeviceIndex": 0, - "NetworkCardIndex": 0, - }, - "PrivateIpAddress": "ip.1.0.0.4", - }, - ], - }, - ] - }, - ], - {}, - [ - call( - ["queue1-dy-c5xlarge-1", "queue1-dy-c5xlarge-2", "queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2"], - nodeaddrs=["ip.1.0.0.1", "ip.1.0.0.2", "ip.1.0.0.3", "ip.1.0.0.4"], - nodehostnames=None, - ) - ], - dict( - zip( - ["queue1-dy-c5xlarge-1", "queue1-dy-c5xlarge-2", "queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2"], - [ - EC2Instance("i-11111", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc)), - EC2Instance("i-22222", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc)), - EC2Instance("i-33333", "ip.1.0.0.3", "ip-1-0-0-3", datetime(2020, 1, 1, tzinfo=timezone.utc)), - EC2Instance("i-44444", "ip.1.0.0.4", "ip-1-0-0-4", datetime(2020, 1, 1, tzinfo=timezone.utc)), - ], - ) - ), - True, - True, - ), ], ids=[ "node list scaling + all_or_nothing without ICE error", @@ -756,11 +386,6 @@ def test_resume_config(config_file, expected_attributes, test_datadir, mocker): "node list scaling + best_effort without ICE error", "node list scaling + best_effort with ICE error", "invalid_heartbeat", - "job level scaling + empty resume file + all_or_nothing with ICE error", - "job level scaling + empty resume file + all_or_nothing without error", - "job level scaling + empty resume file + best_effort with CLIENT error", - "job level scaling + empty resume file + best_effort with ICE error", - "job level scaling + empty resume file + best_effort without error", ], ) def test_resume_launch(