From bd8105d47bbf7cc81730575e565c70b66b531e41 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 17 Oct 2023 14:54:06 +0200 Subject: [PATCH 1/3] Raise exception when CreateFleet doesn't return any instance Raise an exception when CreateFleet doesn't return any instance and the error list in the CreateFleet contains only one error entry. The exception raised is built with the same error code coming from the CreateFleet response, so that this error code can be set into the reason when putting the Slurm nodes into DOWN. This is useful to avoid to trigger the fast capacity failover (error code InsufficientInstanceCapacity) when the CreateFleet call doesn't return any instance because of throttling (error code RequestLimitExceeded). Signed-off-by: Luca Carrogu --- src/slurm_plugin/fleet_manager.py | 13 ++++++++++++- src/slurm_plugin/instance_manager.py | 4 +++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/slurm_plugin/fleet_manager.py b/src/slurm_plugin/fleet_manager.py index 2e527fb06..edf84f8bf 100644 --- a/src/slurm_plugin/fleet_manager.py +++ b/src/slurm_plugin/fleet_manager.py @@ -71,6 +71,14 @@ def __init__(self, message: str): super().__init__(message) +class LaunchInstancesError(Exception): + """Represent an error during the launch of EC2 instances""" + + def __init__(self, code: str, message: str = ""): + self.code = code + super().__init__(message) + + class FleetManagerFactory: @staticmethod def get_manager( @@ -361,7 +369,8 @@ def _launch_instances(self, launch_params): instances = response.get("Instances", []) log_level = logging.WARNING if instances else logging.ERROR - for err in response.get("Errors", []): + err_list = response.get("Errors", []) + for err in err_list: logger.log( log_level, "Error in CreateFleet request (%s): %s - %s", @@ -375,6 +384,8 @@ def _launch_instances(self, launch_params): if partial_instance_ids: logger.error("Unable to retrieve instance info for instances: %s", partial_instance_ids) + if not instances and len(err_list) == 1: + raise LaunchInstancesError(err_list[0].get("ErrorCode"), err_list[0].get("ErrorMessage")) return {"Instances": instances} except ClientError as e: logger.error("Failed CreateFleet request: %s", e.response.get("ResponseMetadata", {}).get("RequestId")) diff --git a/src/slurm_plugin/instance_manager.py b/src/slurm_plugin/instance_manager.py index e021a613f..aa6675383 100644 --- a/src/slurm_plugin/instance_manager.py +++ b/src/slurm_plugin/instance_manager.py @@ -963,7 +963,7 @@ def all_or_nothing_node_assignment( # No instances launched at all, e.g. CreateFleet API returns no EC2 instances self._update_failed_nodes(set(nodes_resume_list), "InsufficientInstanceCapacity", override=False) - def _launch_instances( + def _launch_instances( # noqa: C901 self, nodes_to_launch: Dict[str, any], launch_batch_size: int, @@ -1016,6 +1016,8 @@ def _launch_instances( update_failed_nodes_parameters = {"nodeset": set(batch_nodes)} if isinstance(e, ClientError): update_failed_nodes_parameters["error_code"] = e.response.get("Error", {}).get("Code") + elif isinstance(e, Exception) and hasattr(e, "code"): + update_failed_nodes_parameters["error_code"] = e.code self._update_failed_nodes(**update_failed_nodes_parameters) if job and all_or_nothing_batch: From a28fad2ecd535bf22d3fb53ee7ff3687edaff846 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 18 Oct 2023 09:58:35 +0200 Subject: [PATCH 2/3] Refactor fleet_manager unit tests code Signed-off-by: Luca Carrogu --- src/slurm_plugin/fleet_manager.py | 6 +- tests/slurm_plugin/test_fleet_manager.py | 56 +++++++++---------- .../expected_launch_params.json | 0 .../expected_launch_params.json | 0 .../expected_launch_params.json | 0 .../expected_launch_params.json | 0 .../expected_launch_params.json | 0 .../expected_launch_params.json | 0 .../fleet_spot/expected_launch_params.json | 0 .../expected_launch_params.json | 0 10 files changed, 31 insertions(+), 31 deletions(-) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/all_or_nothing/expected_launch_params.json (100%) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/fleet-multi-az-multi-it-all_or_nothing/expected_launch_params.json (100%) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/fleet-multi-az-multi-it/expected_launch_params.json (100%) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/fleet-multi-az-single-it-all_or_nothing/expected_launch_params.json (100%) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/fleet-single-az-multi-it-all_or_nothing/expected_launch_params.json (100%) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/fleet_ondemand/expected_launch_params.json (100%) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/fleet_spot/expected_launch_params.json (100%) rename tests/slurm_plugin/test_fleet_manager/{TestCreateFleetManager => TestEc2CreateFleetManager}/test_evaluate_launch_params/launch_overrides/expected_launch_params.json (100%) diff --git a/src/slurm_plugin/fleet_manager.py b/src/slurm_plugin/fleet_manager.py index edf84f8bf..a4c8ee064 100644 --- a/src/slurm_plugin/fleet_manager.py +++ b/src/slurm_plugin/fleet_manager.py @@ -72,7 +72,7 @@ def __init__(self, message: str): class LaunchInstancesError(Exception): - """Represent an error during the launch of EC2 instances""" + """Represent an error during the launch of EC2 instances.""" def __init__(self, code: str, message: str = ""): self.code = code @@ -161,11 +161,11 @@ def __init__( @abstractmethod def _evaluate_launch_params(self, count): - pass + """Evaluate parameters to be passed to run_instances call.""" @abstractmethod def _launch_instances(self, launch_params): - pass + """Launch a batch of ec2 instances.""" def launch_ec2_instances(self, count, job_id=None): """ diff --git a/tests/slurm_plugin/test_fleet_manager.py b/tests/slurm_plugin/test_fleet_manager.py index 76439b9c8..b7b65c033 100644 --- a/tests/slurm_plugin/test_fleet_manager.py +++ b/tests/slurm_plugin/test_fleet_manager.py @@ -219,33 +219,35 @@ def test_launch_instances(self, boto3_stubber, launch_params, mocked_boto3_reque # -------- Ec2CreateFleetManager ------ -test_fleet_exception_params = { - "LaunchTemplateConfigs": [ - { - "LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue1-fleet-spot", "Version": "$Latest"}, - "Overrides": [ - { - "InstanceRequirements": { - "VCpuCount": {"Min": 2}, - "MemoryMiB": {"Min": 2048}, - "AllowedInstanceTypes": ["inf*"], - "AcceleratorManufacturers": ["nvidia"], + +class TestEc2CreateFleetManager: + test_fleet_exception_params = { + "LaunchTemplateConfigs": [ + { + "LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue1-fleet-spot", "Version": "$Latest"}, + "Overrides": [ + { + "InstanceRequirements": { + "VCpuCount": {"Min": 2}, + "MemoryMiB": {"Min": 2048}, + "AllowedInstanceTypes": ["inf*"], + "AcceleratorManufacturers": ["nvidia"], + } } - } - ], - } - ], - "SpotOptions": { - "AllocationStrategy": "capacity-optimized", - "SingleInstanceType": False, - "SingleAvailabilityZone": True, - "MinTargetCapacity": 1, - }, - "TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "spot"}, - "Type": "instant", -} + ], + } + ], + "SpotOptions": { + "AllocationStrategy": "capacity-optimized", + "SingleInstanceType": False, + "SingleAvailabilityZone": True, + "MinTargetCapacity": 1, + }, + "TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "spot"}, + "Type": "instant", + } -test_fleet_spot_params = { + test_fleet_spot_params = { "LaunchTemplateConfigs": [ { "LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue1-fleet-spot", "Version": "$Latest"}, @@ -265,7 +267,7 @@ def test_launch_instances(self, boto3_stubber, launch_params, mocked_boto3_reque "Type": "instant", } -test_on_demand_params = { + test_on_demand_params = { "LaunchTemplateConfigs": [ { "LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue2-fleet-ondemand", "Version": "$Latest"}, @@ -286,8 +288,6 @@ def test_launch_instances(self, boto3_stubber, launch_params, mocked_boto3_reque "Type": "instant", } - -class TestCreateFleetManager: @pytest.mark.parametrize( ( "batch_size", diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/all_or_nothing/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/all_or_nothing/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/all_or_nothing/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/all_or_nothing/expected_launch_params.json diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it-all_or_nothing/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it-all_or_nothing/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it-all_or_nothing/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it-all_or_nothing/expected_launch_params.json diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-multi-it/expected_launch_params.json diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-multi-az-single-it-all_or_nothing/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-single-it-all_or_nothing/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-multi-az-single-it-all_or_nothing/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-multi-az-single-it-all_or_nothing/expected_launch_params.json diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-single-az-multi-it-all_or_nothing/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-single-az-multi-it-all_or_nothing/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet-single-az-multi-it-all_or_nothing/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet-single-az-multi-it-all_or_nothing/expected_launch_params.json diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet_ondemand/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_ondemand/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet_ondemand/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_ondemand/expected_launch_params.json diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet_spot/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_spot/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/fleet_spot/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_spot/expected_launch_params.json diff --git a/tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/launch_overrides/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/launch_overrides/expected_launch_params.json similarity index 100% rename from tests/slurm_plugin/test_fleet_manager/TestCreateFleetManager/test_evaluate_launch_params/launch_overrides/expected_launch_params.json rename to tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/launch_overrides/expected_launch_params.json From c80a92587810961fc618c3346ffe6e7a018e652f Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 18 Oct 2023 10:27:42 +0200 Subject: [PATCH 3/3] Unit test for the LaunchInstancesError exception Signed-off-by: Luca Carrogu --- tests/slurm_plugin/test_fleet_manager.py | 128 ++++++++++++++------ tests/slurm_plugin/test_instance_manager.py | 72 ++++++++++- 2 files changed, 160 insertions(+), 40 deletions(-) diff --git a/tests/slurm_plugin/test_fleet_manager.py b/tests/slurm_plugin/test_fleet_manager.py index b7b65c033..d99585330 100644 --- a/tests/slurm_plugin/test_fleet_manager.py +++ b/tests/slurm_plugin/test_fleet_manager.py @@ -16,7 +16,13 @@ import pytest from assertpy import assert_that from botocore.exceptions import ClientError -from slurm_plugin.fleet_manager import Ec2CreateFleetManager, EC2Instance, Ec2RunInstancesManager, FleetManagerFactory +from slurm_plugin.fleet_manager import ( + Ec2CreateFleetManager, + EC2Instance, + Ec2RunInstancesManager, + FleetManagerFactory, + LaunchInstancesError, +) from tests.common import FLEET_CONFIG, MockedBoto3Request @@ -248,45 +254,48 @@ class TestEc2CreateFleetManager: } test_fleet_spot_params = { - "LaunchTemplateConfigs": [ - { - "LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue1-fleet-spot", "Version": "$Latest"}, - "Overrides": [ - {"MaxPrice": "10", "InstanceType": "t2.medium", "SubnetId": "1234567"}, - {"MaxPrice": "10", "InstanceType": "t2.large", "SubnetId": "1234567"}, - ], - } - ], - "SpotOptions": { - "AllocationStrategy": "capacity-optimized", - "SingleInstanceType": False, - "SingleAvailabilityZone": True, - "MinTargetCapacity": 1, - }, - "TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "spot"}, - "Type": "instant", -} + "LaunchTemplateConfigs": [ + { + "LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue1-fleet-spot", "Version": "$Latest"}, + "Overrides": [ + {"MaxPrice": "10", "InstanceType": "t2.medium", "SubnetId": "1234567"}, + {"MaxPrice": "10", "InstanceType": "t2.large", "SubnetId": "1234567"}, + ], + } + ], + "SpotOptions": { + "AllocationStrategy": "capacity-optimized", + "SingleInstanceType": False, + "SingleAvailabilityZone": True, + "MinTargetCapacity": 1, + }, + "TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "spot"}, + "Type": "instant", + } test_on_demand_params = { - "LaunchTemplateConfigs": [ - { - "LaunchTemplateSpecification": {"LaunchTemplateName": "hit-queue2-fleet-ondemand", "Version": "$Latest"}, - "Overrides": [ - {"InstanceType": "t2.medium", "SubnetId": "1234567"}, - {"InstanceType": "t2.large", "SubnetId": "1234567"}, - ], - } - ], - "OnDemandOptions": { - "AllocationStrategy": "lowest-price", - "SingleInstanceType": False, - "SingleAvailabilityZone": True, - "MinTargetCapacity": 1, - "CapacityReservationOptions": {"UsageStrategy": "use-capacity-reservations-first"}, - }, - "TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "on-demand"}, - "Type": "instant", -} + "LaunchTemplateConfigs": [ + { + "LaunchTemplateSpecification": { + "LaunchTemplateName": "hit-queue2-fleet-ondemand", + "Version": "$Latest", + }, + "Overrides": [ + {"InstanceType": "t2.medium", "SubnetId": "1234567"}, + {"InstanceType": "t2.large", "SubnetId": "1234567"}, + ], + } + ], + "OnDemandOptions": { + "AllocationStrategy": "lowest-price", + "SingleInstanceType": False, + "SingleAvailabilityZone": True, + "MinTargetCapacity": 1, + "CapacityReservationOptions": {"UsageStrategy": "use-capacity-reservations-first"}, + }, + "TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "on-demand"}, + "Type": "instant", + } @pytest.mark.parametrize( ( @@ -594,8 +603,45 @@ def test_evaluate_launch_params( } ], ), + # create-fleet - throttling + ( + test_on_demand_params, + [ + MockedBoto3Request( + method="create_fleet", + response={ + "Instances": [], + "Errors": [ + {"ErrorCode": "RequestLimitExceeded", "ErrorMessage": "Request limit exceeded."} + ], + "ResponseMetadata": {"RequestId": "37633199-bcc6-4a88-89e3-89d859d76096"}, + }, + expected_params=test_on_demand_params, + ), + ], + [], + ), + # create-fleet - multiple errors + ( + test_on_demand_params, + [ + MockedBoto3Request( + method="create_fleet", + response={ + "Instances": [], + "Errors": [ + {"ErrorCode": "RequestLimitExceeded", "ErrorMessage": "Request limit exceeded."}, + {"ErrorCode": "AnotherError", "ErrorMessage": "Something went wrong"}, + ], + "ResponseMetadata": {"RequestId": "37633199-bcc6-4a88-89e3-89d859d76096"}, + }, + expected_params=test_on_demand_params, + ), + ], + [], + ), ], - ids=["fleet_spot", "fleet_exception", "fleet_ondemand"], + ids=["fleet_spot", "fleet_exception", "fleet_ondemand", "fleet_throttling", "fleet_multiple_errors"], ) def test_launch_instances( self, @@ -617,6 +663,10 @@ def test_launch_instances( with pytest.raises(Exception) as e: fleet_manager._launch_instances(launch_params) assert isinstance(e, ClientError) + elif len(expected_assigned_nodes) == 0 and len(mocked_boto3_request[0].response.get("Errors")) == 1: + with pytest.raises(LaunchInstancesError) as e: + fleet_manager._launch_instances(launch_params) + assert isinstance(e, LaunchInstancesError) else: assigned_nodes = fleet_manager._launch_instances(launch_params) assert_that(assigned_nodes.get("Instances", [])).is_equal_to(expected_assigned_nodes) diff --git a/tests/slurm_plugin/test_instance_manager.py b/tests/slurm_plugin/test_instance_manager.py index f1d87d4e9..9894b4bda 100644 --- a/tests/slurm_plugin/test_instance_manager.py +++ b/tests/slurm_plugin/test_instance_manager.py @@ -23,7 +23,7 @@ import slurm_plugin from assertpy import assert_that from slurm_plugin.common import ScalingStrategy -from slurm_plugin.fleet_manager import EC2Instance +from slurm_plugin.fleet_manager import EC2Instance, LaunchInstancesError from slurm_plugin.instance_manager import ( HostnameDnsStoreError, InstanceManager, @@ -3252,6 +3252,76 @@ def test_node_assignment_by_scaling_strategy( {}, {"InsufficientInstanceCapacity": {"queue4-st-c5xlarge-1"}}, ), + ( + None, + { + "queue1": {"c52xlarge": ["queue1-st-c52xlarge-1"]}, + "queue2": {"c5xlarge": ["queue2-st-c5xlarge-1"]}, + "queue3": {"p4d24xlarge": ["queue3-st-p4d24xlarge-1"]}, + }, + 15, + {}, + [ + { + "Instances": [ + { + "InstanceId": "i-12345", + "InstanceType": "c5.2xlarge", + "PrivateIpAddress": "ip.1.0.0.1", + "PrivateDnsName": "ip-1-0-0-1", + "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + "PrivateIpAddress": "ip.1.0.0.1", + }, + ], + } + ], + }, + LaunchInstancesError("throttling", "got throttled"), + { + "Instances": [ + { + "InstanceId": "i-123456", + "InstanceType": "p4d24xlarge", + "PrivateIpAddress": "ip.1.0.0.2", + "PrivateDnsName": "ip-1-0-0-2", + "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + "PrivateIpAddress": "ip.1.0.0.2", + }, + ], + } + ], + }, + ], + { + "queue1": { + "c52xlarge": [ + EC2Instance( + "i-12345", "ip.1.0.0.1", "ip-1-0-0-1", datetime(2020, 1, 1, tzinfo=timezone.utc) + ) + ] + }, + "queue3": { + "p4d24xlarge": [ + EC2Instance( + "i-123456", "ip.1.0.0.2", "ip-1-0-0-2", datetime(2020, 1, 1, tzinfo=timezone.utc) + ) + ] + }, + }, + {"throttling": {"queue2-st-c5xlarge-1"}}, + ), ], ) def test_launch_instances(