From 525faf9232bb93ad6567b118b421921067c79fa6 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 16 Oct 2023 10:56:14 +0200 Subject: [PATCH 01/35] Fix Node class to use in mocks The test is for static nodes, not dynamic. Signed-off-by: Enrico Usai --- tests/slurm_plugin/test_clustermgtd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index 9da4032b0..aed08a6c0 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -735,9 +735,9 @@ def test_handle_health_check( ( {"queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2", "queue1-st-c5xlarge-4"}, [ - DynamicNode("queue1-st-c5xlarge-1", "ip", "hostname", "IDLE+CLOUD", "queue1"), - DynamicNode("queue1-st-c5xlarge-2", "ip", "hostname", "DOWN+CLOUD", "queue1"), - DynamicNode("queue1-st-c5xlarge-3", "ip", "hostname", "IDLE+CLOUD", "queue1"), + StaticNode("queue1-st-c5xlarge-1", "ip", "hostname", "IDLE+CLOUD", "queue1"), + StaticNode("queue1-st-c5xlarge-2", "ip", "hostname", "DOWN+CLOUD", "queue1"), + StaticNode("queue1-st-c5xlarge-3", "ip", "hostname", "IDLE+CLOUD", "queue1"), ], {"queue1-st-c5xlarge-2"}, ) From d88054eb16a5d85208021140716c14d41936e118 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Oct 2023 12:21:02 +0100 Subject: [PATCH 02/35] Improve indentation Signed-off-by: Enrico Usai --- tests/slurm_plugin/test_fleet_manager.py | 63 +++--------------------- 1 file changed, 7 insertions(+), 56 deletions(-) diff --git a/tests/slurm_plugin/test_fleet_manager.py b/tests/slurm_plugin/test_fleet_manager.py index d2895307a..f0fe26d3a 100644 --- a/tests/slurm_plugin/test_fleet_manager.py +++ b/tests/slurm_plugin/test_fleet_manager.py @@ -330,42 +330,14 @@ class TestEc2CreateFleetManager: } @pytest.mark.parametrize( - ( - "batch_size", - "queue", - "compute_resource", - "all_or_nothing", - "launch_overrides", - "log_assertions", - ), + ("batch_size", "queue", "compute_resource", "all_or_nothing", "launch_overrides", "log_assertions"), [ # normal - spot - ( - 5, - "queue1", - "fleet-spot", - False, - {}, - None, - ), + (5, "queue1", "fleet-spot", False, {}, None), # normal - on-demand - ( - 5, - "queue2", - "fleet-ondemand", - False, - {}, - None, - ), + (5, "queue2", "fleet-ondemand", False, {}, None), # all or nothing - ( - 5, - "queue1", - "fleet-spot", - True, - {}, - None, - ), + (5, "queue1", "fleet-spot", True, {}, None), # launch_overrides ( 5, @@ -384,32 +356,11 @@ class TestEc2CreateFleetManager: None, ), # Fleet with (Single-Subnet, Multi-InstanceType) AND all_or_nothing is True --> MinTargetCapacity is set - ( - 5, - "queue4", - "fleet1", - True, - {}, - None, - ), + (5, "queue4", "fleet1", True, {}, None), # Fleet with (Multi-Subnet, Single-InstanceType) AND all_or_nothing is True --> MinTargetCapacity is set - ( - 5, - "queue5", - "fleet1", - True, - {}, - None, - ), + (5, "queue5", "fleet1", True, {}, None), # Fleet with (Multi-Subnet, Multi-InstanceType) AND all_or_nothing is False --> NOT set MinTargetCapacity - ( - 5, - "queue6", - "fleet1", - False, - {}, - None, - ), + (5, "queue6", "fleet1", False, {}, None), # Fleet with (Multi-Subnet, Multi-InstanceType) AND all_or_nothing is True --> Log a warning ( 5, From 9eb8a1d025930ade976e0a796682178792ca6a7f Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Fri, 13 Oct 2023 17:16:12 +0200 Subject: [PATCH 03/35] Extend scontrol output parser to be able to retrieve reservation info After this patch the `_parse_nodes_info` method is able 
to retrieve `ReservationName` field. This info will be saved in the `reservation_name` attribute of the SlurmNode. Signed-off-by: Enrico Usai --- src/common/schedulers/slurm_commands.py | 8 ++- src/slurm_plugin/slurm_resources.py | 6 ++ .../common/schedulers/test_slurm_commands.py | 67 ++++++++++++++++++- 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/src/common/schedulers/slurm_commands.py b/src/common/schedulers/slurm_commands.py index efbe6cd8c..3e1217a0b 100644 --- a/src/common/schedulers/slurm_commands.py +++ b/src/common/schedulers/slurm_commands.py @@ -63,7 +63,7 @@ SCONTROL_OUTPUT_AWK_PARSER = ( 'awk \'BEGIN{{RS="\\n\\n" ; ORS="######\\n";}} {{print}}\' | ' + "grep -oP '^(NodeName=\\S+)|(NodeAddr=\\S+)|(NodeHostName=\\S+)|(? List[SlurmNode]: """Parse slurm node info into SlurmNode objects.""" # [ec2-user@ip-10-0-0-58 ~]$ /opt/slurm/bin/scontrol show nodes compute-dy-c5xlarge-[1-3],compute-dy-c5xlarge-50001\ # | awk 'BEGIN{{RS="\n\n" ; ORS="######\n";}} {{print}}' | grep -oP "^(NodeName=\S+)|(NodeAddr=\S+) - # |(NodeHostName=\S+)|(? List[SlurmNode]: # Partitions=compute,compute2 # SlurmdStartTime=2023-01-26T09:57:15 # Reason=some reason + # ReservationName=root_1 # ###### # NodeName=compute-dy-c5xlarge-2 # NodeAddr=1.2.3.4 @@ -430,6 +431,7 @@ def _parse_nodes_info(slurm_node_info: str) -> List[SlurmNode]: "Reason": "reason", "SlurmdStartTime": "slurmdstarttime", "LastBusyTime": "lastbusytime", + "ReservationName": "reservation_name", } date_fields = ["SlurmdStartTime", "LastBusyTime"] diff --git a/src/slurm_plugin/slurm_resources.py b/src/slurm_plugin/slurm_resources.py index abf85413f..a02e67b05 100644 --- a/src/slurm_plugin/slurm_resources.py +++ b/src/slurm_plugin/slurm_resources.py @@ -202,6 +202,7 @@ def __init__( instance=None, slurmdstarttime: datetime = None, lastbusytime: datetime = None, + reservation_name=None, ): """Initialize slurm node with attributes.""" self.name = name @@ -214,6 +215,7 @@ def __init__( self.instance = instance self.slurmdstarttime = slurmdstarttime self.lastbusytime = lastbusytime + self.reservation_name = reservation_name self.is_static_nodes_in_replacement = False self.is_being_replaced = False self._is_replacement_timeout = False @@ -441,6 +443,7 @@ def __init__( instance=None, slurmdstarttime=None, lastbusytime=None, + reservation_name=None, ): """Initialize slurm node with attributes.""" super().__init__( @@ -453,6 +456,7 @@ def __init__( instance, slurmdstarttime, lastbusytime=lastbusytime, + reservation_name=reservation_name, ) def is_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): @@ -558,6 +562,7 @@ def __init__( instance=None, slurmdstarttime=None, lastbusytime=None, + reservation_name=None, ): """Initialize slurm node with attributes.""" super().__init__( @@ -570,6 +575,7 @@ def __init__( instance, slurmdstarttime, lastbusytime=lastbusytime, + reservation_name=reservation_name, ) def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): diff --git a/tests/common/schedulers/test_slurm_commands.py b/tests/common/schedulers/test_slurm_commands.py index bcfb90247..7c2addd72 100644 --- a/tests/common/schedulers/test_slurm_commands.py +++ b/tests/common/schedulers/test_slurm_commands.py @@ -128,6 +128,49 @@ def test_is_static_node(nodename, expected_is_static): ], False, ), + ( + "NodeName=queue1-st-crt2micro-1\n" + "NodeAddr=10.0.236.182\n" + "NodeHostName=queue1-st-crt2micro-1\n" + "State=IDLE+CLOUD+MAINTENANCE+RESERVED\n" + 
"Partitions=queue1\n" + "SlurmdStartTime=2023-01-23T17:57:07\n" + "LastBusyTime=2023-10-13T10:13:20\n" + "ReservationName=root_1\n" + "######\n" + "NodeName=queuep4d-dy-crp4d-1\n" + "NodeAddr=queuep4d-dy-crp4d-1\n" + "NodeHostName=queuep4d-dy-crp4d-1\n" + "State=DOWN+CLOUD+MAINTENANCE+POWERED_DOWN+RESERVED\n" + "Partitions=queuep4d\n" + "SlurmdStartTime=None\n" + "LastBusyTime=Unknown\n" + "Reason=test [slurm@2023-10-20T07:18:35]\n" + "ReservationName=root_6\n" + "######\n", + [ + StaticNode( + "queue1-st-crt2micro-1", + "10.0.236.182", + "queue1-st-crt2micro-1", + "IDLE+CLOUD+MAINTENANCE+RESERVED", + "queue1", + slurmdstarttime=datetime(2023, 1, 23, 17, 57, 7).astimezone(tz=timezone.utc), + lastbusytime=datetime(2023, 10, 13, 10, 13, 20).astimezone(tz=timezone.utc), + reservation_name="root_1", + ), + DynamicNode( + "queuep4d-dy-crp4d-1", + "queuep4d-dy-crp4d-1", + "queuep4d-dy-crp4d-1", + "DOWN+CLOUD+MAINTENANCE+POWERED_DOWN+RESERVED", + "queuep4d", + reservation_name="root_6", + reason="test [slurm@2023-10-20T07:18:35]", + ), + ], + False, + ), ( "NodeName=multiple-dy-c5xlarge-3\n" "NodeAddr=multiple-dy-c5xlarge-3\n" @@ -1043,7 +1086,25 @@ def test_get_nodes_info_argument_validation( " CurrentWatts=0 AveWatts=0\n" " ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s\n" " Reason=Reboot ASAP : reboot issued [slurm@2023-01-26T10:11:40]\n" - " Comment=some comment \n" + " Comment=some comment \n\n" + "NodeName=queue1-st-crt2micro-1 Arch=x86_64 CoresPerSocket=1\n" + " CPUAlloc=0 CPUEfctv=1 CPUTot=1 CPULoad=0.00\n" + " AvailableFeatures=static,t2.micro,crt2micro\n" + " ActiveFeatures=static,t2.micro,crt2micro\n" + " Gres=(null)\n" + " NodeAddr=10.0.236.182 NodeHostName=queue1-st-crt2micro-1 Version=23.02.4\n" + " OS=Linux 5.10.186-179.751.amzn2.x86_64 #1 SMP Tue Aug 1 20:51:38 UTC 2023\n" + " RealMemory=972 AllocMem=0 FreeMem=184 Sockets=1 Boards=1\n" + " State=IDLE+CLOUD+MAINTENANCE+RESERVED ThreadsPerCore=1 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A\n" + " Partitions=queue1\n" + " BootTime=2023-10-13T10:09:58 SlurmdStartTime=2023-10-13T10:13:17\n" + " LastBusyTime=2023-10-13T10:13:20 ResumeAfterTime=None\n" + " CfgTRES=cpu=1,mem=972M,billing=1\n" + " AllocTRES=\n" + " CapWatts=n/a\n" + " CurrentWatts=0 AveWatts=0\n" + " ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s\n" + " ReservationName=root_5\n" ), ( "NodeName=queue1-st-compute-resource-1-1\nNodeAddr=192.168.123.191\n" @@ -1056,6 +1117,10 @@ def test_get_nodes_info_argument_validation( "SlurmdStartTime=2023-01-26T09:57:16\n" "LastBusyTime=Unknown\n" "Reason=Reboot ASAP : reboot issued [slurm@2023-01-26T10:11:40]\n######\n" + "NodeName=queue1-st-crt2micro-1\nNodeAddr=10.0.236.182\n" + "NodeHostName=queue1-st-crt2micro-1\nState=IDLE+CLOUD+MAINTENANCE+RESERVED\nPartitions=queue1\n" + "SlurmdStartTime=2023-10-13T10:13:17\n" + "LastBusyTime=2023-10-13T10:13:20\nReservationName=root_5######\n" ), ) ], From 54d2cf1189164f8ca4b0d087a1e0eca9cff4afe1 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 16 Oct 2023 10:32:57 +0200 Subject: [PATCH 04/35] Add commands to manage slurm reservation Add commands to * create a reservation * Use slurm as default user rather than root, given that "slurm" is the admin in ParallelCluster setup. * update an existing reservation * I put in common steps to populate create and update commands since they are very similar. 
* check if reservation exists * delete_reservation Signed-off-by: Enrico Usai --- src/common/schedulers/slurm_commands.py | 1 + .../schedulers/slurm_reservation_commands.py | 188 +++++++++++ .../test_slurm_reservation_commands.py | 292 ++++++++++++++++++ 3 files changed, 481 insertions(+) create mode 100644 src/common/schedulers/slurm_reservation_commands.py create mode 100644 tests/common/schedulers/test_slurm_reservation_commands.py diff --git a/src/common/schedulers/slurm_commands.py b/src/common/schedulers/slurm_commands.py index 3e1217a0b..e4c178d90 100644 --- a/src/common/schedulers/slurm_commands.py +++ b/src/common/schedulers/slurm_commands.py @@ -68,6 +68,7 @@ # Set default timeouts for running different slurm commands. # These timeouts might be needed when running on large scale +DEFAULT_SCONTROL_COMMAND_TIMEOUT = 30 DEFAULT_GET_INFO_COMMAND_TIMEOUT = 30 DEFAULT_UPDATE_COMMAND_TIMEOUT = 60 diff --git a/src/common/schedulers/slurm_reservation_commands.py b/src/common/schedulers/slurm_reservation_commands.py new file mode 100644 index 000000000..c02187604 --- /dev/null +++ b/src/common/schedulers/slurm_reservation_commands.py @@ -0,0 +1,188 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import logging + +# A nosec comment is appended to the following line in order to disable the B404 check. +# In this file the input of the module subprocess is trusted. +import subprocess # nosec B404 +from datetime import datetime + +from common.schedulers.slurm_commands import DEFAULT_SCONTROL_COMMAND_TIMEOUT, SCONTROL +from common.utils import run_command, validate_subprocess_argument + +logger = logging.getLogger(__name__) + + +def _create_or_update_reservation( + base_command: str, + name: str, + nodes: str = None, + partition: str = None, + user: str = None, + start_time: datetime = None, + duration: int = None, + number_of_nodes: int = None, + flags: str = None, + command_timeout=DEFAULT_SCONTROL_COMMAND_TIMEOUT, + raise_on_error=True, +): + """ + Create or update slurm reservation, adding all the parameters. 
+ + Official documentation is https://slurm.schedmd.com/reservations.html + """ + cmd = _add_param(base_command, "ReservationName", name) + cmd = _add_param(cmd, "nodes", nodes) + cmd = _add_param(cmd, "partition", partition) + cmd = _add_param(cmd, "user", user) + if isinstance(start_time, datetime): + # Convert start time to format accepted by slurm command + cmd = _add_param(cmd, "starttime", start_time.strftime("%Y-%m-%dT%H:%M:%S")) + cmd = _add_param(cmd, "duration", duration) + cmd = _add_param(cmd, "nodecnt", number_of_nodes) + cmd = _add_param(cmd, "flags", flags) + + run_command(cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True) # nosec B604 + + +def create_slurm_reservation( + name: str, + nodes: str = "ALL", + partition: str = None, + user: str = "slurm", + start_time: datetime = None, + duration: int = None, + number_of_nodes: int = None, + flags: str = "maint", + command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, + raise_on_error: bool = True, +): + """ + Create slurm reservation with scontrol call. + + The command to create a reservation is something like the following: + scontrol create reservation starttime=2009-02-06T16:00:00 duration=120 user=slurm flags=maint nodes=ALL + scontrol create reservation user=root partition=queue1 starttime=noon duration=60 nodecnt=10 + + We're using slurm as default user because it is the default Slurm administrator user in ParallelCluster, + this is the user running slurmctld daemon. + "maint" flag permits to overlap existing reservations. + Official documentation is https://slurm.schedmd.com/reservations.html + """ + cmd = f"{SCONTROL} create reservation" + + logger.info("Creating Slurm reservation with command: %s", cmd) + _create_or_update_reservation( + cmd, name, nodes, partition, user, start_time, duration, number_of_nodes, flags, command_timeout, raise_on_error + ) + + +def update_slurm_reservation( + name: str, + nodes: str = None, + partition: str = None, + user: str = None, + start_time: datetime = None, + duration: int = None, + number_of_nodes: int = None, + flags: str = None, + command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, + raise_on_error: bool = True, +): + """ + Update slurm reservation with scontrol call. + + The command to update a reservation is something like the following: + scontrol update ReservationName=root_3 duration=150 users=admin + + Official documentation is https://slurm.schedmd.com/reservations.html + """ + cmd = f"{SCONTROL} update" + + logger.info("Updating Slurm reservation with command: %s", cmd) + _create_or_update_reservation( + cmd, name, nodes, partition, user, start_time, duration, number_of_nodes, flags, command_timeout, raise_on_error + ) + + +def delete_slurm_reservation( + name: str, + command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, + raise_on_error: bool = True, +): + """ + Delete slurm reservation with scontrol call. 
+ + The command to delete a reservation is something like the following: + scontrol delete ReservationName=root_6 + + Official documentation is https://slurm.schedmd.com/reservations.html + """ + cmd = f"{SCONTROL} delete reservation" + cmd = _add_param(cmd, "ReservationName", name) + + logger.info("Deleting Slurm reservation with command: %s", cmd) + run_command(cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True) # nosec B604 + + +def _add_param(cmd, param_name, value): + """If the given value is not None, validate it and concatenate ' param_name=value' to cmd.""" + if value: + validate_subprocess_argument(value) + cmd += f" {param_name}={value}" + return cmd + + +def does_slurm_reservation_exist( + name: str, + command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, + raise_on_error: bool = True, +): + """ + Check if slurm reservation exists, by retrieving information with scontrol call. + + Return True if reservation exists, False otherwise. + Raise a CalledProcessError if the command fails for other reasons. + + $ scontrol show ReservationName=root_5 + Reservation root_5 not found + $ echo $? + 1 + + $ scontrol show ReservationName=root_6 + ReservationName=root_6 StartTime=2023-10-13T15:57:03 EndTime=2024-10-12T15:57:03 Duration=365-00:00:00 + Nodes=q1-st-cr2-1,q2-dy-cr4-[1-5] NodeCnt=6 CoreCnt=481 Features=(null) PartitionName=(null) Flags=MAINT,SPEC_NODES + TRES=cpu=481 + Users=root Groups=(null) Accounts=(null) Licenses=(null) State=ACTIVE BurstBuffer=(null) Watts=n/a + MaxStartDelay=(null) + $ echo $? + 0 + + Official documentation is https://slurm.schedmd.com/reservations.html + """ + cmd = f"{SCONTROL} show" + cmd = _add_param(cmd, "ReservationName", name) + + try: + logger.debug("Retrieving Slurm reservation with command: %s", cmd) + run_command(cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True) # nosec B604 + reservation_exists = True + + except subprocess.CalledProcessError as e: + output = e.stdout.rstrip() + if output == f"Reservation {name} not found": + logger.info(f"Slurm reservation {name} not found.") + reservation_exists = False + else: + logger.error("Failed when retrieving Slurm reservation info with command %s. Error: %s", cmd, output) + raise e + + return reservation_exists diff --git a/tests/common/schedulers/test_slurm_reservation_commands.py b/tests/common/schedulers/test_slurm_reservation_commands.py new file mode 100644 index 000000000..abb1def33 --- /dev/null +++ b/tests/common/schedulers/test_slurm_reservation_commands.py @@ -0,0 +1,292 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+from datetime import datetime + +import pytest +from assertpy import assert_that +from common.schedulers.slurm_commands import DEFAULT_SCONTROL_COMMAND_TIMEOUT, SCONTROL +from common.schedulers.slurm_reservation_commands import ( + _add_param, + _create_or_update_reservation, + create_slurm_reservation, + delete_slurm_reservation, + update_slurm_reservation, +) + + +@pytest.mark.parametrize( + ( + "base_cmd, name, nodes, partition, user, start_time, duration, " + "number_of_nodes, flags, raise_on_error, timeout, expected_cmd" + ), + [ + ( + f"{SCONTROL} create reservation", + "root_1", + None, + None, + None, + None, + None, + None, + None, + None, + None, + f"{SCONTROL} create reservation ReservationName=root_1", + ), + ( + f"{SCONTROL} create reservation", + "root_1", + None, + None, + None, + None, + None, + None, + None, + None, + None, + f"{SCONTROL} create reservation ReservationName=root_1", + ), + ( + f"{SCONTROL} update", + "root_2", + "nodes-1,nodes[2-6]", + "queue1", + "user1", + datetime(2023, 1, 23, 17, 57, 7), + 180, + 10, + "testflag", + True, + 10, + ( + f"{SCONTROL} update ReservationName=root_2 nodes=nodes-1,nodes[2-6] partition=queue1" + " user=user1 starttime=2023-01-23T17:57:07 duration=180 nodecnt=10 flags=testflag" + ), + ), + ], +) +def test_create_or_update_reservation( + base_cmd, + name, + nodes, + partition, + user, + start_time, + duration, + number_of_nodes, + flags, + expected_cmd, + raise_on_error, + timeout, + mocker, +): + run_cmd_mock = mocker.patch("common.schedulers.slurm_reservation_commands.run_command") + kwargs = {"name": name} + + if nodes: + kwargs.update({"nodes": nodes}) + if partition: + kwargs.update({"partition": partition}) + if user: + kwargs.update({"user": user}) + if start_time: + kwargs.update({"start_time": start_time}) + if duration: + kwargs.update({"duration": duration}) + if number_of_nodes: + kwargs.update({"number_of_nodes": number_of_nodes}) + if flags: + kwargs.update({"flags": flags}) + + raise_on_error = raise_on_error is True + kwargs.update({"raise_on_error": raise_on_error}) + command_timeout = timeout if timeout else DEFAULT_SCONTROL_COMMAND_TIMEOUT + kwargs.update({"command_timeout": command_timeout}) + + _create_or_update_reservation(base_cmd, **kwargs) + run_cmd_mock.assert_called_with(expected_cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True) + + +@pytest.mark.parametrize( + "name, nodes, partition, user, start_time, duration, number_of_nodes, flags", + [ + ( + "root_1", + None, + None, + None, + None, + None, + None, + None, + ), + ( + "root_2", + "nodes-1,nodes[2-6]", + "queue1", + "user1", + datetime(2023, 1, 23, 17, 57, 7), + 180, + 10, + "testflag", + ), + ], +) +def test_create_slurm_reservation( + name, + nodes, + partition, + user, + start_time, + duration, + number_of_nodes, + flags, + mocker, +): + run_cmd_mock = mocker.patch("common.schedulers.slurm_reservation_commands._create_or_update_reservation") + + # Compose command to avoid passing None values + kwargs = {"name": name} + if nodes: + kwargs.update({"nodes": nodes}) + if partition: + kwargs.update({"partition": partition}) + if user: + kwargs.update({"user": user}) + if start_time: + kwargs.update({"start_time": start_time}) + if duration: + kwargs.update({"duration": duration}) + if number_of_nodes: + kwargs.update({"number_of_nodes": number_of_nodes}) + if flags: + kwargs.update({"flags": flags}) + + create_slurm_reservation(**kwargs) + + # check expected internal call + nodes = nodes if nodes else "ALL" + user = user if user 
else "slurm" + flags = flags if flags else "maint" + run_cmd_mock.assert_called_with( + f"{SCONTROL} create reservation", + name, + nodes, + partition, + user, + start_time, + duration, + number_of_nodes, + flags, + DEFAULT_SCONTROL_COMMAND_TIMEOUT, + True, + ) + + +@pytest.mark.parametrize( + "name, nodes, partition, user, start_time, duration, number_of_nodes, flags", + [ + ( + "root_1", + None, + None, + None, + None, + None, + None, + None, + ), + ( + "root_2", + "nodes-1,nodes[2-6]", + "queue1", + "user1", + datetime(2023, 1, 23, 17, 57, 7), + 180, + 10, + "testflag", + ), + ], +) +def test_update_slurm_reservation( + name, + nodes, + partition, + user, + start_time, + duration, + number_of_nodes, + flags, + mocker, +): + run_cmd_mock = mocker.patch("common.schedulers.slurm_reservation_commands._create_or_update_reservation") + + # Compose command to avoid passing None values + kwargs = {"name": name} + if nodes: + kwargs.update({"nodes": nodes}) + if partition: + kwargs.update({"partition": partition}) + if user: + kwargs.update({"user": user}) + if start_time: + kwargs.update({"start_time": start_time}) + if duration: + kwargs.update({"duration": duration}) + if number_of_nodes: + kwargs.update({"number_of_nodes": number_of_nodes}) + if flags: + kwargs.update({"flags": flags}) + + update_slurm_reservation(**kwargs) + + # check expected internal call + run_cmd_mock.assert_called_with( + f"{SCONTROL} update", + name, + nodes, + partition, + user, + start_time, + duration, + number_of_nodes, + flags, + DEFAULT_SCONTROL_COMMAND_TIMEOUT, + True, + ) + + +@pytest.mark.parametrize( + "name, cmd_call_kwargs", + [ + ("root_1", {"name": "root_1"}), + ], +) +def test_delete_reservation(name, cmd_call_kwargs, mocker): + run_cmd_mock = mocker.patch("common.schedulers.slurm_reservation_commands.run_command") + delete_slurm_reservation(name) + + cmd = f"{SCONTROL} delete reservation ReservationName={name}" + run_cmd_mock.assert_called_with(cmd, raise_on_error=True, timeout=DEFAULT_SCONTROL_COMMAND_TIMEOUT, shell=True) + + +@pytest.mark.parametrize( + "cmd, param_name, value, expected_cmd", + [ + ("cmd", "", "", "cmd"), + ("cmd", "param", None, "cmd"), + ("cmd", "param", "value", "cmd param=value"), + ], +) +def test_add_param(cmd, param_name, value, expected_cmd): + assert_that(_add_param(cmd, param_name, value)).is_equal_to(expected_cmd) From 55be6fa0eba0f3e1ef71d168c6487aa8a98a5d5b Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 25 Oct 2023 12:53:46 +0200 Subject: [PATCH 05/35] Add command to get Slurm reservations info using scontrol Defined a new dataclass to store Slurm reservation info. Signed-off-by: Enrico Usai --- .../schedulers/slurm_reservation_commands.py | 72 +++++++++++- src/slurm_plugin/slurm_resources.py | 8 ++ .../test_slurm_reservation_commands.py | 107 ++++++++++++++---- 3 files changed, 161 insertions(+), 26 deletions(-) diff --git a/src/common/schedulers/slurm_reservation_commands.py b/src/common/schedulers/slurm_reservation_commands.py index c02187604..b8eb3fc46 100644 --- a/src/common/schedulers/slurm_reservation_commands.py +++ b/src/common/schedulers/slurm_reservation_commands.py @@ -14,13 +14,21 @@ # In this file the input of the module subprocess is trusted. 
import subprocess # nosec B404 from datetime import datetime +from typing import List from common.schedulers.slurm_commands import DEFAULT_SCONTROL_COMMAND_TIMEOUT, SCONTROL -from common.utils import run_command, validate_subprocess_argument +from common.utils import check_command_output, run_command, validate_subprocess_argument +from slurm_plugin.slurm_resources import SlurmReservation logger = logging.getLogger(__name__) +SCONTROL_SHOW_RESERVATION_OUTPUT_AWK_PARSER = ( + 'awk \'BEGIN{{RS="\\n\\n" ; ORS="######\\n";}} {{print}}\' | ' + + "grep -oP '^(ReservationName=\\S+)|(? List[SlurmReservation]: + """ + List existing slurm reservations with scontrol call. + + The output of the command is something like the following: + $ scontrol show reservations + ReservationName=root_7 StartTime=2023-10-25T09:46:49 EndTime=2024-10-24T09:46:49 Duration=365-00:00:00 + Nodes=queuep4d-dy-crp4d-[1-5] NodeCnt=5 CoreCnt=480 Features=(null) PartitionName=(null) Flags=MAINT,SPEC_NODES + TRES=cpu=480 + Users=root Groups=(null) Accounts=(null) Licenses=(null) State=ACTIVE BurstBuffer=(null) Watts=n/a + MaxStartDelay=(null) + + Official documentation is https://slurm.schedmd.com/reservations.html + """ + # awk is used to replace the \n\n record separator with '######\n' + show_reservations_command = f"{SCONTROL} show reservations | {SCONTROL_SHOW_RESERVATION_OUTPUT_AWK_PARSER}" + slurm_reservations_info = check_command_output( + show_reservations_command, raise_on_error=raise_on_error, timeout=command_timeout, shell=True + ) # nosec B604 + + return _parse_reservations_info(slurm_reservations_info) + + +def _parse_reservations_info(slurm_reservations_info: str) -> List[SlurmReservation]: + """Parse slurm reservations info into SlurmReservation objects.""" + # $ /opt/slurm/bin/scontrol show reservations awk 'BEGIN{{RS="\n\n" ; ORS="######\n";}} {{print}}' | + # grep -oP '^(ReservationName=\S+)|(? Date: Mon, 16 Oct 2023 10:45:41 +0200 Subject: [PATCH 06/35] Add SlurmNode methods to check reservation state Added: * _is_reserved -> when there is RESERVED state * is_in_maintenance -> when there are both RESERVED and MAINTENANCE states. ### Tests * Added unit tests for the new methods. 
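A minimal usage sketch of the new methods, mirroring the unit tests added below (`DynamicNode` is the existing class from `slurm_plugin.slurm_resources`; states are illustrative):

```python
from slurm_plugin.slurm_resources import DynamicNode

# RESERVED alone: the node counts as reserved, but not as in maintenance.
reserved = DynamicNode("queue1-dy-c5xlarge-1", "nodeip", "nodehostname", "IDLE+CLOUD+RESERVED", "queue1")
assert reserved._is_reserved() and not reserved.is_in_maintenance()

# Both MAINTENANCE and RESERVED states are required for is_in_maintenance().
in_maintenance = DynamicNode("queue1-dy-c5xlarge-1", "nodeip", "nodehostname", "MAINTENANCE+RESERVED", "queue1")
assert in_maintenance.is_in_maintenance()
```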
Signed-off-by: Enrico Usai --- src/slurm_plugin/slurm_resources.py | 11 ++++ .../slurm_resources/test_slurm_resources.py | 59 +++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/src/slurm_plugin/slurm_resources.py b/src/slurm_plugin/slurm_resources.py index 80ef3b66b..8f3fca5d1 100644 --- a/src/slurm_plugin/slurm_resources.py +++ b/src/slurm_plugin/slurm_resources.py @@ -187,6 +187,9 @@ class SlurmNode(metaclass=ABCMeta): SLURM_SCONTROL_REBOOT_REQUESTED_STATE = "REBOOT_REQUESTED" SLURM_SCONTROL_REBOOT_ISSUED_STATE = "REBOOT_ISSUED" SLURM_SCONTROL_INVALID_REGISTRATION_STATE = "INVALID_REG" + # Reservation related state + SLURM_SCONTROL_RESERVED_STATE = "RESERVED" + SLURM_SCONTROL_MAINTENANCE_STATE = "MAINTENANCE" SLURM_SCONTROL_NODE_DOWN_NOT_RESPONDING_REASON = re.compile(r"Not responding \[slurm@.+\]") @@ -295,6 +298,14 @@ def is_up(self): """Check if slurm node is in a healthy state.""" return not self._is_drain() and not self.is_down() and not self.is_powering_down() + def _is_reserved(self): + """Check if slurm node is reserved.""" + return self.SLURM_SCONTROL_RESERVED_STATE in self.states + + def is_in_maintenance(self): + """Check if slurm node is reserved and in maintenance.""" + return self.SLURM_SCONTROL_MAINTENANCE_STATE in self.states and self._is_reserved() + def is_powering_up(self): """Check if slurm node is in powering up state.""" return self.SLURM_SCONTROL_POWER_UP_STATE in self.states diff --git a/tests/slurm_plugin/slurm_resources/test_slurm_resources.py b/tests/slurm_plugin/slurm_resources/test_slurm_resources.py index db68efc67..89b69b225 100644 --- a/tests/slurm_plugin/slurm_resources/test_slurm_resources.py +++ b/tests/slurm_plugin/slurm_resources/test_slurm_resources.py @@ -287,6 +287,65 @@ def test_slurm_node_is_up(node, expected_output): assert_that(node.is_up()).is_equal_to(expected_output) +@pytest.mark.parametrize( + "node, expected_output", + [ + (StaticNode("queue1-st-c5xlarge-1", "nodeip", "nodehostname", "IDLE+CLOUD+POWER", "queue1"), False), + ( + StaticNode("queue1-st-c5xlarge-1", "nodeip", "nodehostname", "IDLE+CLOUD+MAINTENANCE+RESERVED", "queue1"), + True, + ), + ( + StaticNode( + "queue1-st-c5xlarge-1", + "nodeip", + "nodehostname", + "DOWN+CLOUD+MAINTENANCE+POWERED_DOWN+RESERVED", + "queue1", + ), + True, + ), + ( + DynamicNode("queue1-dy-c5xlarge-1", "nodeip", "nodehostname", "IDLE+CLOUD+RESERVED", "queue1"), + True, + ), + (DynamicNode("queue1-dy-c5xlarge-1", "nodeip", "nodehostname", "RESERVED", "queue1"), True), + ], +) +def test_slurm_node_is_reserved(node, expected_output): + assert_that(node._is_reserved()).is_equal_to(expected_output) + + +@pytest.mark.parametrize( + "node, expected_output", + [ + (StaticNode("queue1-st-c5xlarge-1", "nodeip", "nodehostname", "IDLE+CLOUD+POWER", "queue1"), False), + ( + StaticNode("queue1-st-c5xlarge-1", "nodeip", "nodehostname", "IDLE+CLOUD+MAINTENANCE+RESERVED", "queue1"), + True, + ), + ( + StaticNode( + "queue1-st-c5xlarge-1", + "nodeip", + "nodehostname", + "DOWN+CLOUD+MAINTENANCE+POWERED_DOWN+RESERVED", + "queue1", + ), + True, + ), + ( + DynamicNode("queue1-dy-c5xlarge-1", "nodeip", "nodehostname", "IDLE+CLOUD+MAINTENANCE", "queue1"), + False, # RESERVED is required as well + ), + (DynamicNode("queue1-dy-c5xlarge-1", "nodeip", "nodehostname", "MAINTENANCE", "queue1"), False), + (DynamicNode("queue1-dy-c5xlarge-1", "nodeip", "nodehostname", "MAINTENANCE+RESERVED", "queue1"), True), + ], +) +def test_slurm_node_is_in_maintenance(node, expected_output): + 
assert_that(node.is_in_maintenance()).is_equal_to(expected_output) + + @pytest.mark.parametrize( "node, expected_output", [ From f458d34c43e6cbdcd258c3bf356a6afa12669153 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 16 Oct 2023 11:36:06 +0200 Subject: [PATCH 07/35] Exclude nodes in maintenance from static nodes to be replaced After this patch the nodes in `MAINTENANCE+RESERVED` state will be excluded by the static nodes in replacement list. Will see in the "replacement list" only nodes that are not yet up (as before) and nodes that are not in maintenance. This mechanism permits the daemons (or the user) to avoid replacement of turned down static nodes (`DOWN`) by putting them in maintenance as preliminary step. This works only for nodes in both `MAINTENANCE` and `RESERVED` state, nodes only in `RESERVED` state will be replaced as before. In the future, we can extend this mechanism to have an additional field to check (e.g. skip them only if the maintenance is set by the `root` user). Signed-off-by: Enrico Usai --- src/slurm_plugin/clustermgtd.py | 14 ++++++++----- tests/slurm_plugin/test_clustermgtd.py | 29 +++++++++++++++++++++----- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index 3aa67abe5..333c0b881 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -696,16 +696,19 @@ def _is_instance_unhealthy(self, instance_status: EC2InstanceHealthState, health ) return is_instance_status_unhealthy - def _update_static_nodes_in_replacement(self, slurm_nodes): - """Update self.static_nodes_in_replacement by removing nodes that finished replacement and is up.""" + def _update_static_nodes_in_replacement(self, slurm_nodes: List[SlurmNode]): + """Remove from self.static_nodes_in_replacement nodes that are up or that are in maintenance.""" nodename_to_slurm_nodes_map = {node.name: node for node in slurm_nodes} nodes_still_in_replacement = set() for nodename in self._static_nodes_in_replacement: node = nodename_to_slurm_nodes_map.get(nodename) - # Remove nodename from static_nodes_in_replacement if node is no longer an active node or node is up - if node and not node.is_up(): + # Consider nodename still in replacement if node is not up and not in maintenance + if node and not node.is_up() and not node.is_in_maintenance(): nodes_still_in_replacement.add(nodename) + + # override self._static_nodes_in_replacement with updated list self._static_nodes_in_replacement = nodes_still_in_replacement + # update node attributes for node in slurm_nodes: node.is_static_nodes_in_replacement = node.name in self._static_nodes_in_replacement node.is_being_replaced = self._is_node_being_replaced(node) @@ -864,7 +867,8 @@ def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map): """ log.info("Performing node maintenance actions") active_nodes = self._find_active_nodes(partitions_name_map) - # Update self.static_nodes_in_replacement by removing any up nodes from the set + + # Update self.static_nodes_in_replacement by removing from the set any node that is up or in maintenance self._update_static_nodes_in_replacement(active_nodes) log.info( "Following nodes are currently in replacement: %s", print_with_count(self._static_nodes_in_replacement) diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index aed08a6c0..1cc45f297 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -733,13 +733,32 @@ def 
test_handle_health_check( "current_replacing_nodes, slurm_nodes, expected_replacing_nodes", [ ( - {"queue1-st-c5xlarge-1", "queue1-st-c5xlarge-2", "queue1-st-c5xlarge-4"}, + { + "queue1-st-c5xlarge-1", + "queue1-st-c5xlarge-2", + "queue1-st-c5xlarge-3", + "queue1-st-c5xlarge-4", + "queue1-st-c5xlarge-5", + "queue1-st-c5xlarge-6", + "queue1-st-c5xlarge-13", + }, [ + # nodes in the current replacement list StaticNode("queue1-st-c5xlarge-1", "ip", "hostname", "IDLE+CLOUD", "queue1"), StaticNode("queue1-st-c5xlarge-2", "ip", "hostname", "DOWN+CLOUD", "queue1"), - StaticNode("queue1-st-c5xlarge-3", "ip", "hostname", "IDLE+CLOUD", "queue1"), - ], - {"queue1-st-c5xlarge-2"}, + StaticNode("queue1-st-c5xlarge-4", "ip", "hostname", "IDLE+CLOUD+MAINTENANCE+RESERVED", "queue1"), + StaticNode("queue1-st-c5xlarge-3", "ip", "hostname", "DOWN+CLOUD+MAINTENANCE+RESERVED", "queue1"), + StaticNode("queue1-st-c5xlarge-5", "ip", "hostname", "IDLE+CLOUD+RESERVED", "queue1"), + StaticNode("queue1-st-c5xlarge-6", "ip", "hostname", "DOWN+CLOUD+RESERVED", "queue1"), + # nodes not in replacement list + StaticNode("queue1-st-c5xlarge-8", "ip", "hostname", "IDLE+CLOUD+MAINTENANCE+RESERVED", "queue1"), + StaticNode("queue1-st-c5xlarge-7", "ip", "hostname", "DOWN+CLOUD+MAINTENANCE+RESERVED", "queue1"), + StaticNode("queue1-st-c5xlarge-9", "ip", "hostname", "IDLE+CLOUD+RESERVED", "queue1"), + StaticNode("queue1-st-c5xlarge-10", "ip", "hostname", "DOWN+CLOUD+RESERVED", "queue1"), + StaticNode("queue1-st-c5xlarge-11", "ip", "hostname", "IDLE+CLOUD", "queue1"), + StaticNode("queue1-st-c5xlarge-12", "ip", "hostname", "DOWN+CLOUD", "queue1"), + ], + {"queue1-st-c5xlarge-2", "queue1-st-c5xlarge-6"}, ) ], ids=["mixed"], @@ -747,7 +766,7 @@ def test_handle_health_check( @pytest.mark.usefixtures( "initialize_instance_manager_mock", "initialize_executor_mock", "initialize_console_logger_mock" ) -def test_update_static_nodes_in_replacement(current_replacing_nodes, slurm_nodes, expected_replacing_nodes, mocker): +def test_update_static_nodes_in_replacement(current_replacing_nodes, slurm_nodes, expected_replacing_nodes): mock_sync_config = SimpleNamespace( insufficient_capacity_timeout=600, cluster_name="cluster", From dbc7224b2a85414615cd414713ef788f8fa090a9 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 18 Oct 2023 11:52:45 +0200 Subject: [PATCH 08/35] Add boto3 layer This code is taken from the CLI, removing the caching mechanism. Signed-off-by: Enrico Usai --- src/aws/__init__.py | 10 +++ src/aws/common.py | 156 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 src/aws/__init__.py create mode 100644 src/aws/common.py diff --git a/src/aws/__init__.py b/src/aws/__init__.py new file mode 100644 index 000000000..a2d2da929 --- /dev/null +++ b/src/aws/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/src/aws/common.py b/src/aws/common.py new file mode 100644 index 000000000..856eccf4c --- /dev/null +++ b/src/aws/common.py @@ -0,0 +1,156 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import logging +import time +from enum import Enum + +import boto3 +from botocore.config import Config +from botocore.exceptions import BotoCoreError, ClientError, ParamValidationError + +LOGGER = logging.getLogger(__name__) + + +class AWSClientError(Exception): + """Error during execution of some AWS calls.""" + + class ErrorCode(Enum): + """Error codes for AWS ClientError.""" + + VALIDATION_ERROR = "ValidationError" + REQUEST_LIMIT_EXCEEDED = "RequestLimitExceeded" + THROTTLING_EXCEPTION = "ThrottlingException" + CONDITIONAL_CHECK_FAILED_EXCEPTION = "ConditionalCheckFailedException" + + @classmethod + def throttling_error_codes(cls): + """Return a set of error codes returned when service rate limits are exceeded.""" + return {cls.REQUEST_LIMIT_EXCEEDED.value, cls.THROTTLING_EXCEPTION.value} + + def __init__(self, function_name: str, message: str, error_code: str = None): + super().__init__(message) + self.message = message + self.error_code = error_code + self.function_name = function_name + + +class LimitExceededError(AWSClientError): + """Error caused by exceeding the limits of a downstream AWS service.""" + + def __init__(self, function_name: str, message: str, error_code: str = None): + super().__init__(function_name=function_name, message=message, error_code=error_code) + + +class BadRequestError(AWSClientError): + """Error caused by a problem in the request.""" + + def __init__(self, function_name: str, message: str, error_code: str = None): + super().__init__(function_name=function_name, message=message, error_code=error_code) + + +class AWSExceptionHandler: + """AWS Exception handler.""" + + @staticmethod + def handle_client_exception(func): + """Handle Boto3 errors, can be used as a decorator.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except ParamValidationError as validation_error: + error = BadRequestError( + func.__name__, + "Error validating parameter. 
Failed with exception: {0}".format(str(validation_error)), + ) + except BotoCoreError as e: + error = AWSClientError(func.__name__, str(e)) + except ClientError as e: + # add request id + message = e.response["Error"]["Message"] + error_code = e.response["Error"]["Code"] + + if error_code in AWSClientError.ErrorCode.throttling_error_codes(): + error = LimitExceededError(func.__name__, message, error_code) + elif error_code == AWSClientError.ErrorCode.VALIDATION_ERROR: + error = BadRequestError(func.__name__, message, error_code) + else: + error = AWSClientError(func.__name__, message, error_code) + LOGGER.error("Encountered error when performing boto3 call in %s: %s", error.function_name, error.message) + raise error + + return wrapper + + @staticmethod + def retry_on_boto3_throttling(func): + """Retry boto3 calls on throttling, can be used as a decorator.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + while True: + try: + return func(*args, **kwargs) + except ClientError as e: + if e.response["Error"]["Code"] != "Throttling": + raise + LOGGER.debug("Throttling when calling %s function. Will retry in %d seconds.", func.__name__, 5) + time.sleep(5) + + return wrapper + + +def _log_boto3_calls(params, **kwargs): + service = kwargs["event_name"].split(".")[-2] + operation = kwargs["event_name"].split(".")[-1] + region = kwargs["context"].get("client_region", boto3.session.Session().region_name) + LOGGER.info( + "Executing boto3 call: region=%s, service=%s, operation=%s, params=%s", region, service, operation, params + ) + + +class Boto3Client: + """Boto3 client Class.""" + + def __init__(self, client_name: str, config: Config): + self._client = boto3.client(client_name, config=config if config else None) + self._client.meta.events.register("provide-client-params.*.*", _log_boto3_calls) + + def _paginate_results(self, method, **kwargs): + """ + Return a generator for a boto3 call, this allows pagination over an arbitrary number of responses. + + :param method: boto3 method + :param kwargs: arguments to method + :return: generator with boto3 results + """ + paginator = self._client.get_paginator(method.__name__) + for page in paginator.paginate(**kwargs).result_key_iters(): + for result in page: + yield result + + +class Boto3Resource: + """Boto3 resource Class.""" + + def __init__(self, resource_name: str): + self._resource = boto3.resource(resource_name) + self._resource.meta.client.meta.events.register("provide-client-params.*.*", _log_boto3_calls) + + +def get_region(): + """Get region used internally for all the AWS calls.""" + region = boto3.session.Session().region_name + if region is None: + raise AWSClientError("get_region", "AWS region not configured") + return region From b02d9cf04fe947c367b98518f1511cbb696abdcf Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 18 Oct 2023 13:13:51 +0200 Subject: [PATCH 09/35] Introduce CapacityBlockManager to handle instances part of a Capacity Block reservation ### Capacity Block reservation Capacity Block reservation for ML (CB) allow AWS users to reserve blocks of GPU capacity for a future start date and for a time duration of their choice. They are capacity reservations of type `capacity-block`, with additional attributes like: `StartDate`, `EndDate` and additional states. Nodes part of the CB cannot be used until the CB time started. 
Jobs assigned to these nodes will be kept in pending until the CB is active, to achieve this we're going to use Slurm reservation, in particular we'll set the nodes in maintenance state when CB is not yet active or when CB time is expired. ### CapacityBlockManager CapacityBlockManager is a new class that will perform the following actions: * when CB is not active or expired, a new Slurm reservation will be created/updated * when CB is active, Slurm reservation will be removed and nodes will become standard static instances Slurm reservation name will be "pcluster-{capacity_block_reservation_id}" CapacityBlockManager will be initialized by clustermgtd with region, boto3_config and fleet_config info. Fleet config is used to retrieve capacity reservation ids and type (ondemand vs capacity-block). The manager will reload fleet-config info and capacity reservation info every time the daemon is restarted or when the config is modified. The logic updates capacity block reservation info every 10 minutes. The manager will remove the nodes associated with capacity-block reservations that are not active from the list of unhealty static nodenames. ### CapacityBlock CapacityBlock is a new class to store internal info about the capacity block, merging them from ec2 (e.g. capacity block state) and from the config (i.e. list of nodes associated to it) ### Leftover slurm reservations When a CB is removed from a cluster config during a cluster update we need to remove related slurm reservations. We're checking slurm reservations for the given nodes and deleting them when are managed by CapacityBlockManager but no longer in the config. Signed-off-by: Enrico Usai --- CHANGELOG.md | 1 + src/aws/ec2.py | 110 ++++ src/common/time_utils.py | 5 + src/slurm_plugin/capacity_block_manager.py | 326 ++++++++++++ src/slurm_plugin/clustermgtd.py | 23 +- src/slurm_plugin/slurm_resources.py | 2 +- tests/common/test_time_utils.py | 18 + .../test_capacity_block_manager.py | 469 ++++++++++++++++++ tests/slurm_plugin/test_clustermgtd.py | 87 +++- 9 files changed, 1035 insertions(+), 6 deletions(-) create mode 100644 src/aws/ec2.py create mode 100644 src/slurm_plugin/capacity_block_manager.py create mode 100644 tests/common/test_time_utils.py create mode 100644 tests/slurm_plugin/test_capacity_block_manager.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e1961d6c..76415752d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ This file is used to list changes made in each version of the aws-parallelcluste ------ **ENHANCEMENTS** +- Add support for EC2 Capacity Blocks for ML. **CHANGES** - Perform job-level scaling by default for all jobs, using information in the `SLURM_RESUME_FILE`. Job-level scaling diff --git a/src/aws/ec2.py b/src/aws/ec2.py new file mode 100644 index 000000000..ce15ed97d --- /dev/null +++ b/src/aws/ec2.py @@ -0,0 +1,110 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List + +from aws.common import AWSExceptionHandler, Boto3Client + + +class CapacityReservationInfo: + """ + Data object wrapping the result of a describe-capacity-reservations call. + + { + "CapacityReservationId": "cr-abcdEXAMPLE9876ef ", + "EndDateType": "unlimited", + "AvailabilityZone": "eu-west-1a", + "InstanceMatchCriteria": "open", + "Tags": [], + "EphemeralStorage": false, + "CreateDate": "2019-08-07T11:34:19.000Z", + "AvailableInstanceCount": 3, + "InstancePlatform": "Linux/UNIX", + "TotalInstanceCount": 3, + "State": "cancelled", + "Tenancy": "default", + "EbsOptimized": true, + "InstanceType": "m5.large" + } + """ + + def __init__(self, capacity_reservation_data): + self.capacity_reservation_data = capacity_reservation_data + + def capacity_reservation_id(self): + """Return the id of the Capacity Reservation.""" + return self.capacity_reservation_data.get("CapacityReservationId") + + def state(self): + """Return the state of the Capacity Reservation.""" + return self.capacity_reservation_data.get("State") + + +class CapacityBlockReservationInfo(CapacityReservationInfo): + """ + Data object wrapping the result of a describe-capacity-reservations --capacity-type capacity-block call. + + { "CapacityReservationId": "cr-a1234567", + "EndDateType": "limited", + "ReservationType": "capacity-block", + "AvailabilityZone": "eu-east-2a", + "InstanceMatchCriteria": "targeted", + "EphemeralStorage": false, + "CreateDate": "2023-07-29T14:22:45Z ", + “StartDate": "2023-08-15T12:00:00Z", + “EndDate": "2023-08-19T12:00:00Z", + "AvailableInstanceCount": 0, + "InstancePlatform": "Linux/UNIX", + "TotalInstanceCount": 16, + “State": "payment-pending", + "Tenancy": "default", + "EbsOptimized": true, + "InstanceType": "p5.48xlarge“ + } + """ + + def __init__(self, capacity_reservation_data): + super().__init__(capacity_reservation_data) + + def start_date(self): + """Return the start date of the CB.""" + return self.capacity_reservation_data.get("StartDate") + + def end_date(self): + """Return the start date of the CB.""" + return self.capacity_reservation_data.get("EndDate") + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + +class Ec2Client(Boto3Client): + """Implement EC2 Boto3 client.""" + + def __init__(self, config=None): + super().__init__("ec2", config=config) + + @AWSExceptionHandler.handle_client_exception + def describe_capacity_reservations( + self, capacity_reservation_ids: List[str], reservation_type=None + ) -> List[CapacityBlockReservationInfo]: + """Accept a space separated list of ids. 
Return a list of CapacityReservationInfo.""" + result = [] + response = list( + self._paginate_results( + self._client.describe_capacity_reservations, + CapacityReservationIds=capacity_reservation_ids, + ReservationType=reservation_type, + ) + ) + for capacity_reservation in response: + result.append(CapacityBlockReservationInfo(capacity_reservation)) + + return result diff --git a/src/common/time_utils.py b/src/common/time_utils.py index cac1ab18a..c5337073e 100644 --- a/src/common/time_utils.py +++ b/src/common/time_utils.py @@ -19,3 +19,8 @@ def minutes(min): def seconds(sec): """Convert seconds to milliseconds.""" return sec * 1000 + + +def seconds_to_minutes(value): + """Convert seconds to minutes.""" + return int(value / 60) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py new file mode 100644 index 000000000..0856ba9d8 --- /dev/null +++ b/src/slurm_plugin/capacity_block_manager.py @@ -0,0 +1,326 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import logging +from datetime import datetime, timezone +from enum import Enum +from typing import Dict, List + +from common.schedulers.slurm_reservation_commands import ( + create_slurm_reservation, + delete_slurm_reservation, + does_slurm_reservation_exist, + update_slurm_reservation, +) +from common.time_utils import seconds_to_minutes +from slurm_plugin.slurm_resources import SlurmNode + +from aws.ec2 import CapacityBlockReservationInfo, Ec2Client + +logger = logging.getLogger(__name__) + +# Time in minutes to wait to retrieve Capacity Block Reservation information from EC2 +CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD = 10 +SLURM_RESERVATION_NAME_PREFIX = "pcluster-" + + +class CapacityType(Enum): + """Enum to identify the type compute supported by the queues.""" + + CAPACITY_BLOCK = "capacity-block" + ONDEMAND = "on-demand" + SPOT = "spot" + + +class CapacityBlock: + """ + Class to store Capacity Block info from EC2 and fleet config. + + Contains info like: + - queue and compute resource from the config, + - state from EC2, + - name of the related slurm reservation. 
+ """ + + def __init__(self, capacity_block_id, queue_name, compute_resource_name): + self.capacity_block_id = capacity_block_id + self.queue_name = queue_name + self.compute_resource_name = compute_resource_name + self._capacity_block_reservation_info = None + self._nodenames = [] + + def update_ec2_info(self, capacity_block_reservation_info: CapacityBlockReservationInfo): + """Update info from CapacityBlockReservationInfo.""" + self._capacity_block_reservation_info = capacity_block_reservation_info + + def slurm_reservation_name(self): + """Retrieve slurm reservation associated.""" + return f"{SLURM_RESERVATION_NAME_PREFIX}{self.capacity_block_id}" + + def add_nodename(self, nodename: str): + """Add node name to the list of nodenames associated to the capacity block.""" + self._nodenames.append(nodename) + + def nodenames(self): + """Return list of nodenames associated with the capacity block.""" + return self._nodenames + + def state(self): + """Return state of the CB: payment-pending, pending, active, expired, and payment-failed.""" + return self._capacity_block_reservation_info.state() + + def is_active(self): + """Return true if CB is in active state.""" + return self.state() == "active" + + def does_node_belong_to(self, node): + """Return true if the node belongs to the CB.""" + return node.queue_name == self.queue_name and node.compute_resource_name == self.compute_resource_name + + @staticmethod + def slurm_reservation_name_to_id(slurm_reservation_name: str): + """Parse slurm reservation name to retrieve related capacity block id.""" + return slurm_reservation_name[len(SLURM_RESERVATION_NAME_PREFIX) :] # noqa E203 + + @staticmethod + def is_capacity_block_slurm_reservation(slurm_reservation_name: str): + """Return true if slurm reservation name is related to a CB, if it matches a specific internal convention.""" + return slurm_reservation_name.startswith(SLURM_RESERVATION_NAME_PREFIX) + + def __eq__(self, other): + return ( + self.capacity_block_id == other.capacity_block_id + and self.queue_name == other.queue_name + and self.compute_resource_name == other.compute_resource_name + ) + + +class CapacityBlockManager: + """Capacity Block Reservation Manager.""" + + def __init__(self, region, fleet_config, boto3_config): + self._region = region + self._fleet_config = fleet_config + self._boto3_config = boto3_config + self._ec2_client = None + # internal variables to store Capacity Block info from fleet config and EC2 + self._capacity_blocks: Dict[str, CapacityBlock] = {} + self._capacity_blocks_update_time = None + self._reserved_nodenames: List[str] = [] + + @property + def ec2_client(self): + if not self._ec2_client: + self._ec2_client = Ec2Client(config=self._boto3_config) + return self._ec2_client + + def get_reserved_nodenames(self, nodes: List[SlurmNode]): + """Manage nodes part of capacity block reservation. Returns list of reserved nodes.""" + # evaluate if it's the moment to update info + is_time_to_update = ( + self._capacity_blocks_update_time + and seconds_to_minutes(datetime.now(tz=timezone.utc) - self._capacity_blocks_update_time) + > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD + ) + if is_time_to_update: # TODO: evaluate time to update accordingly to capacity block start time + reserved_nodenames = [] + + # update capacity blocks details from ec2 (e.g. 
state) + self._update_capacity_blocks_info_from_ec2() + # associate nodenames to capacity blocks, according to queues and compute resources from fleet configuration + self._associate_nodenames_to_capacity_blocks(nodes) + + # create, update or delete slurm reservation for the nodes according to CB details. + for capacity_block in self._capacity_blocks.values(): + reserved_nodenames.extend(self._update_slurm_reservation(capacity_block)) + self._reserved_nodenames = reserved_nodenames + + # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks, + # by checking slurm reservation info of the given nodes + self._cleanup_leftover_slurm_reservations(nodes) + + return self._reserved_nodenames + + def _associate_nodenames_to_capacity_blocks(self, nodes: List[SlurmNode]): + """ + Update capacity_block info adding nodenames list. + + Check configured CBs and associate nodes to them according to queue and compute resource info. + """ + for node in nodes: + capacity_block: CapacityBlock + for capacity_block in self._capacity_blocks.values(): + if capacity_block.does_node_belong_to(node): + capacity_block.add_nodename(node.name) + break + + def _cleanup_leftover_slurm_reservations(self, nodes: List[StaticNode]): + """Find list of slurm reservations associated to the nodes not part of the configured CBs.""" + for node in nodes: + if node.reservation_name: + # nodes with a slurm reservation already in place + slurm_reservation_name = node.reservation_name + if CapacityBlock.is_capacity_block_slurm_reservation(slurm_reservation_name): + capacity_block_id = CapacityBlock.slurm_reservation_name_to_id(slurm_reservation_name) + if capacity_block_id not in self._capacity_blocks.keys(): + logger.info( + ( + "Found leftover slurm reservation %s. " + "Related Capacity Block %s is no longer in the cluster configuration. Deleting it." + ), + slurm_reservation_name, + capacity_block_id, + ) + delete_slurm_reservation(name=slurm_reservation_name) + else: + logger.debug( + "Slurm reservation %s is not managed by ParallelCluster. Skipping it.", slurm_reservation_name + ) + + @staticmethod + def _update_slurm_reservation(capacity_block: CapacityBlock): + """ + Update Slurm reservation associated to the given Capacity Block. + + A CB has five possible states: payment-pending, pending, active, expired and payment-failed, + we need to create/delete Slurm reservation accordingly. + + returns list of nodes reserved for that capacity block, if it's not active. + """ + + def _log_cb_info(action_info): + logger.info( + "Capacity Block reservation %s is in state %s. %s Slurm reservation %s for nodes %s.", + capacity_block.capacity_block_id, + capacity_block.state(), + action_info, + slurm_reservation_name, + capacity_block_nodenames, + ) + + nodes_in_slurm_reservation = [] + + # retrieve list of nodes associated to a given slurm reservation/capacity block + slurm_reservation_name = capacity_block.slurm_reservation_name() + capacity_block_nodes = capacity_block.nodenames() + capacity_block_nodenames = ",".join(capacity_block_nodes) + + reservation_exists = does_slurm_reservation_exist(name=slurm_reservation_name) + # if CB is active we need to remove Slurm reservation and start nodes + if capacity_block.is_active(): + # if Slurm reservation exists, delete it. + if reservation_exists: + _log_cb_info("Deleting related") + delete_slurm_reservation(name=slurm_reservation_name) + else: + _log_cb_info("Nothing to do. 
No existing") + + # if CB is expired or not active we need to (re)create Slurm reservation + # to avoid considering nodes as unhealthy + else: + nodes_in_slurm_reservation = capacity_block_nodes + # create or update Slurm reservation + if reservation_exists: + _log_cb_info("Updating existing related") + update_slurm_reservation(name=slurm_reservation_name, nodes=capacity_block_nodenames) + else: + _log_cb_info("Creating related") + create_slurm_reservation( + name=slurm_reservation_name, + start_time=datetime.now(tz=timezone.utc), + nodes=capacity_block_nodenames, + ) + + return nodes_in_slurm_reservation + + def _update_capacity_blocks_info_from_ec2(self): + """ + Store in the _capacity_reservations a dict for CapacityReservation info. + + This method is called every time the CapacityBlockManager is re-initialized, + so when it starts/is restarted or when fleet configuration changes. + """ + # Retrieve updated capacity reservation information at initialization time, and every tot minutes + self._capacity_blocks = self._capacity_blocks_from_config() + + if self._capacity_blocks: + capacity_block_ids = self._capacity_blocks.keys() + logger.info( + "Retrieving updated Capacity Block reservation information from EC2 for %s", + ",".join(capacity_block_ids), + ) + capacity_block_reservations_info: List[ + CapacityBlockReservationInfo + ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) + + for capacity_block_reservation_info in capacity_block_reservations_info: + capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() + self._capacity_blocks[capacity_block_id].update_ec2_info(capacity_block_reservation_info) + + self._capacity_blocks_update_time = datetime.now(tz=timezone.utc) + + def _capacity_blocks_from_config(self): + """ + Collect list of capacity reservation target from all queues/compute-resources in the fleet config. 
+ + Fleet config json has the following format: + { + "my-queue": { + "my-compute-resource": { + "Api": "create-fleet", + "CapacityType": "on-demand|spot|capacity-block", + "AllocationStrategy": "lowest-price|capacity-optimized|use-capacity-reservations-first", + "Instances": [ + { "InstanceType": "p4d.24xlarge" } + ], + "MaxPrice": "", + "Networking": { + "SubnetIds": ["subnet-123456"] + }, + "CapacityReservationId": "id" + } + } + } + """ + capacity_blocks: Dict[str, CapacityBlock] = {} + logger.info("Retrieving Capacity Block reservation information for fleet config.") + + for queue_name, queue_config in self._fleet_config.items(): + for compute_resource_name, compute_resource_config in queue_config.items(): + if self._is_compute_resource_associated_to_capacity_block(compute_resource_config): + capacity_block_id = self._capacity_reservation_id_from_compute_resource_config( + compute_resource_config + ) + capacity_block = CapacityBlock( + capacity_block_id=capacity_block_id, + queue_name=queue_name, + compute_resource_name=compute_resource_name, + ) + capacity_blocks.update({capacity_block_id: capacity_block}) + + return capacity_blocks + + @staticmethod + def _is_compute_resource_associated_to_capacity_block(compute_resource_config): + """Return True if compute resource is associated to a Capacity Block reservation.""" + capacity_type = compute_resource_config.get("CapacityType", CapacityType.ONDEMAND) + return capacity_type == CapacityType.CAPACITY_BLOCK.value + + @staticmethod + def _capacity_reservation_id_from_compute_resource_config(compute_resource_config): + """Return capacity reservation target if present, None otherwise.""" + try: + return compute_resource_config["CapacityReservationId"] + except KeyError as e: + # This should never happen because this file is created by cookbook config parser + logger.error( + "Unable to retrieve CapacityReservationId from compute resource info: %s", compute_resource_config + ) + raise e diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index 333c0b881..89758f54f 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -40,6 +40,7 @@ from common.time_utils import seconds from common.utils import check_command_output, read_json, sleep_remaining_loop_time, time_is_up, wait_remaining_time from retrying import retry +from slurm_plugin.capacity_block_manager import CapacityBlockManager from slurm_plugin.cluster_event_publisher import ClusterEventPublisher from slurm_plugin.common import TIMESTAMP_FORMAT, log_exception, print_with_count from slurm_plugin.console_logger import ConsoleLogger @@ -382,6 +383,7 @@ def __init__(self, config): self._console_logger = None self._event_publisher = None self._partition_nodelist_mapping_instance = None + self._capacity_block_manager = None self.set_config(config) def set_config(self, config: ClustermgtdConfig): @@ -406,6 +408,7 @@ def set_config(self, config: ClustermgtdConfig): self._compute_fleet_status_manager = ComputeFleetStatusManager() self._instance_manager = self._initialize_instance_manager(config) self._console_logger = self._initialize_console_logger(config) + self._capacity_block_manager = self._initialize_capacity_block_manager(config) def shutdown(self): if self._task_executor: @@ -454,6 +457,12 @@ def _initialize_console_logger(config): ), ) + @staticmethod + def _initialize_capacity_block_manager(config): + return CapacityBlockManager( + region=config.region, fleet_config=config.fleet_config, boto3_config=config.boto3_config + ) + def 
_update_compute_fleet_status(self, status): log.info("Updating compute fleet status from %s to %s", self._compute_fleet_status, status) self._compute_fleet_status_manager.update_status(status) @@ -820,7 +829,12 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes): except Exception as e: log.error("Encountered exception when retrieving console output from unhealthy static nodes: %s", e) - node_list = [node.name for node in unhealthy_static_nodes] + # Remove the nodes part of the unactive Capacity Block reservations from the list of unhealthy nodes. + # nodes from active Capacity Block reservation will be instead managed as standard static instances. + reserved_unhealthy_nodenames = self._capacity_block_manager.get_reserved_nodenames(unhealthy_static_nodes) + log.info("Removing reserved nodes %s from list of unhealthy nodes", ",".join(reserved_unhealthy_nodenames)) + node_list = [node.name for node in unhealthy_static_nodes if node.name not in reserved_unhealthy_nodenames] + # Set nodes into down state so jobs can be requeued immediately try: log.info("Setting unhealthy static nodes to DOWN") @@ -866,6 +880,7 @@ def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map): A list of slurm nodes is passed in and slurm node map with IP/nodeaddr as key should be avoided. """ log.info("Performing node maintenance actions") + # Retrieve nodes from Slurm partitions in ACTIVE state active_nodes = self._find_active_nodes(partitions_name_map) # Update self.static_nodes_in_replacement by removing from the set any node that is up or in maintenance @@ -873,7 +888,10 @@ def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map): log.info( "Following nodes are currently in replacement: %s", print_with_count(self._static_nodes_in_replacement) ) + # terminate powering down instances self._handle_powering_down_nodes(active_nodes) + + # retrieve and manage unhealthy nodes ( unhealthy_dynamic_nodes, unhealthy_static_nodes, @@ -885,6 +903,8 @@ def _maintain_nodes(self, partitions_name_map, compute_resource_nodes_map): if unhealthy_static_nodes: log.info("Found the following unhealthy static nodes: %s", print_with_count(unhealthy_static_nodes)) self._handle_unhealthy_static_nodes(unhealthy_static_nodes) + + # evaluate partitions to put in protected mode and ICEs nodes to terminate if self._is_protected_mode_enabled(): self._handle_protected_mode_process(active_nodes, partitions_name_map) if self._config.disable_nodes_on_insufficient_capacity: @@ -1030,6 +1050,7 @@ def _handle_nodes_failing_health_check(self, nodes_failing_health_check: List[Sl ) def _handle_failed_health_check_nodes_in_replacement(self, active_nodes): + """Detect failed health checks for static nodes in replacement, updating _static_nodes_in_replacement attr.""" failed_health_check_nodes_in_replacement = [] for node in active_nodes: if node.is_static_nodes_in_replacement and node.is_failing_health_check: diff --git a/src/slurm_plugin/slurm_resources.py b/src/slurm_plugin/slurm_resources.py index 8f3fca5d1..5a00f550d 100644 --- a/src/slurm_plugin/slurm_resources.py +++ b/src/slurm_plugin/slurm_resources.py @@ -213,7 +213,7 @@ def __init__( instance=None, slurmdstarttime: datetime = None, lastbusytime: datetime = None, - reservation_name=None, + reservation_name: str = None, ): """Initialize slurm node with attributes.""" self.name = name diff --git a/tests/common/test_time_utils.py b/tests/common/test_time_utils.py new file mode 100644 index 000000000..e931ca2a5 --- /dev/null +++ 
b/tests/common/test_time_utils.py @@ -0,0 +1,18 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from assertpy import assert_that +from common.time_utils import seconds_to_minutes + + +@pytest.mark.parametrize("value_in_seconds, expected_output", [(0, 0), (12, 0), (60, 1), (66, 1), (1202, 20)]) +def test_seconds_to_minutes(value_in_seconds, expected_output): + assert_that(seconds_to_minutes(value_in_seconds)).is_equal_to(expected_output) diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py new file mode 100644 index 000000000..d981022a5 --- /dev/null +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -0,0 +1,469 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with +# the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +from datetime import datetime +from unittest.mock import call + +import pytest +from assertpy import assert_that +from slurm_plugin.capacity_block_manager import SLURM_RESERVATION_NAME_PREFIX, CapacityBlock, CapacityBlockManager +from slurm_plugin.slurm_resources import StaticNode + +from aws.ec2 import CapacityBlockReservationInfo + +FAKE_CAPACITY_BLOCK_ID = "cr-a1234567" +FAKE_CAPACITY_BLOCK_INFO = { + "CapacityReservationId": FAKE_CAPACITY_BLOCK_ID, + "EndDateType": "limited", + "ReservationType": "capacity-block", + "AvailabilityZone": "eu-east-2a", + "InstanceMatchCriteria": "targeted", + "EphemeralStorage": False, + "CreateDate": "2023-07-29T14:22:45Z ", + "StartDate": "2023-08-15T12:00:00Z", + "EndDate": "2023-08-19T12:00:00Z", + "AvailableInstanceCount": 0, + "InstancePlatform": "Linux/UNIX", + "TotalInstanceCount": 16, + "State": "payment-pending", + "Tenancy": "default", + "EbsOptimized": True, + "InstanceType": "p5.48xlarge", +} + + +@pytest.fixture +def capacity_block(): + return CapacityBlock(FAKE_CAPACITY_BLOCK_ID, "queue-cb", "compute-resource-cb") + + +class TestCapacityBlock: + @pytest.mark.parametrize( + ("state", "expected_output"), + [("active", True), ("anything-else", False)], + ) + def test_is_active(self, capacity_block, state, expected_output): + capacity_block_reservation_info = CapacityBlockReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) + capacity_block.update_ec2_info(capacity_block_reservation_info) + assert_that(capacity_block.is_active()).is_equal_to(expected_output) + + @pytest.mark.parametrize( + "nodes_to_add", + [(["node1"]), (["node1", "node2"])], + ) + def test_add_nodename(self, capacity_block, nodes_to_add): + for nodename in nodes_to_add: + capacity_block.add_nodename(nodename) + assert_that(capacity_block.nodenames()).is_equal_to(nodes_to_add) + + @pytest.mark.parametrize( + ("node", "expected_output"), + [ + (StaticNode("queue-cb-st-compute-resource-cb-4", "ip-1", "hostname-1", "some_state", "queue-cb"), True), + (StaticNode("queue1-st-c5xlarge-4", "ip-1", "hostname-1", "some_state", "queue1"), False), + (StaticNode("queue2-st-compute-resource1-4", "ip-1", "hostname-1", "some_state", "queue2"), False), + ], + ) + def test_does_node_belong_to(self, capacity_block, node, expected_output): + assert_that(capacity_block.does_node_belong_to(node)).is_equal_to(expected_output) + + @pytest.mark.parametrize( + ("reservation_name", "expected_output"), + [("test", ""), (f"{SLURM_RESERVATION_NAME_PREFIX}anything-else", "anything-else")], + ) + def test_slurm_reservation_name_to_id(self, reservation_name, expected_output): + assert_that(CapacityBlock.slurm_reservation_name_to_id(reservation_name)).is_equal_to(expected_output) + + @pytest.mark.parametrize( + ("reservation_name", "expected_output"), + [("test", False), (f"{SLURM_RESERVATION_NAME_PREFIX}anything-else", True)], + ) + def test_is_capacity_block_slurm_reservation(self, reservation_name, expected_output): + assert_that(CapacityBlock.is_capacity_block_slurm_reservation(reservation_name)).is_equal_to(expected_output) + + +class TestCapacityBlockManager: + @pytest.fixture + def capacity_block_manager(self): + return CapacityBlockManager("eu-west-2", {}, "fake_boto3_config") + + def test_ec2_client(self, capacity_block_manager, mocker): + ec2_mock = mocker.patch("slurm_plugin.capacity_block_manager.Ec2Client", return_value=mocker.MagicMock()) + capacity_block_manager.ec2_client() + ec2_mock.assert_called_with(config="fake_boto3_config") + capacity_block_manager.ec2_client() 
+ ec2_mock.assert_called_once() + + @pytest.mark.parametrize( + ("capacity_blocks", "nodes", "expected_nodenames_in_capacity_block"), + [ + ( + { + "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + "cr-234567": CapacityBlock("id2", "queue-cb2", "compute-resource-cb2"), + }, + [ + StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb"), + StaticNode("queue1-st-compute-resource1-2", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("queue-cb-st-compute-resource-cb-3", "ip-1", "hostname-1", "some_state", "queue-cb"), + StaticNode("queue1-st-compute-resource1-4", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("queue-cb2-st-compute-resource-cb2-5", "ip-1", "hostname-1", "some_state", "queue-cb2"), + StaticNode("queue-cb-st-othercr-6", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("otherqueue-st-compute-resource-cb-7", "ip-1", "hostname-1", "some_state", "otherqueue"), + ], + { + "cr-123456": ["queue-cb-st-compute-resource-cb-1", "queue-cb-st-compute-resource-cb-3"], + "cr-234567": ["queue-cb2-st-compute-resource-cb2-5"], + }, + ) + ], + ) + def test_associate_nodenames_to_capacity_blocks( + self, + capacity_block_manager, + capacity_blocks, + nodes, + expected_nodenames_in_capacity_block, + ): + capacity_block_manager._capacity_blocks = capacity_blocks + capacity_block_manager._associate_nodenames_to_capacity_blocks(nodes) + + for capacity_block_id in capacity_block_manager._capacity_blocks.keys(): + # assert in the nodenames list there are only nodes associated to the right queue and compute resource + assert_that(capacity_block_manager._capacity_blocks.get(capacity_block_id).nodenames()).is_equal_to( + expected_nodenames_in_capacity_block.get(capacity_block_id) + ) + + @pytest.mark.parametrize( + ("nodes", "expected_leftover_slurm_reservations"), + [ + ( + [ + # node without a slurm reservation -> skipped + StaticNode("queue1-st-compute-resource1-1", "ip-1", "hostname-1", "some_state", "queue1"), + # node with a reservation from the customer -> skipped + StaticNode( + "queue4-st-compute-resource1-1", + "ip-1", + "hostname-1", + "some_state", + "queue4", + reservation_name="other-reservation", + ), + # node associated with CB, not yet part of slurm reservation -> skipped + StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb"), + # node with a reservation associated with CB -> skipped + StaticNode( + "queue-cb-st-compute-resource-cb-2", + "ip-1", + "hostname-1", + "some_state", + "queue-cb", + reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-123456", + ), + # node with a reservation associated with an old CB but in the queue/cr associated to a new CB, + # this is a leftover reservation + StaticNode( + "queue-cb-st-compute-resource-cb-3", + "ip-1", + "hostname-1", + "some_state", + "queue-cb", + reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-876543", + ), + # node with a reservation associated with existing CB, + # but from an older queue/cr no longer in config -> skipped + StaticNode( + "queue2-st-compute-resource1-1", + "ip-1", + "hostname-1", + "some_state", + "queue2", + reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-123456", + ), + # node associated with another old CB, no longer in the config, this is a leftover reservation + StaticNode( + "queue3-st-compute-resource1-1", + "ip-1", + "hostname-1", + "some_state", + "queue3", + reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654", + ), + ], + 
[f"{SLURM_RESERVATION_NAME_PREFIX}cr-876543", f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654"], + ) + ], + ) + def test_cleanup_leftover_slurm_reservations( + self, + mocker, + capacity_block_manager, + capacity_block, + nodes, + expected_leftover_slurm_reservations, + ): + # only cr-123456, queue-cb, compute-resource-cb is in the list of capacity blocks from config + capacity_block_manager._capacity_blocks = {"cr-123456": capacity_block} + delete_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.delete_slurm_reservation") + capacity_block_manager._cleanup_leftover_slurm_reservations(nodes) + + # verify that only the slurm reservation associated with a CB, no longer in the config, + # are considered as leftover + expected_calls = [] + for slurm_reservation in expected_leftover_slurm_reservations: + expected_calls.append(call(name=slurm_reservation)) + delete_res_mock.assert_has_calls(expected_calls) + + @pytest.mark.parametrize( + ( + "state", + "reservation_exists", + "expected_create_res_call", + "expected_update_res_call", + "expected_delete_res_call", + ), + [ + ("pending", False, True, False, False), + ("pending", True, False, True, False), + ("active", False, False, False, False), + ("active", True, False, False, True), + ], + ) + def test_update_slurm_reservation( + self, + mocker, + capacity_block_manager, + capacity_block, + state, + reservation_exists, + expected_create_res_call, + expected_update_res_call, + expected_delete_res_call, + caplog, + ): + caplog.set_level(logging.INFO) + capacity_block_reservation_info = CapacityBlockReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) + capacity_block.update_ec2_info(capacity_block_reservation_info) + capacity_block.add_nodename("node1") + capacity_block.add_nodename("node2") + nodenames = ",".join(capacity_block.nodenames()) + slurm_reservation_name = f"{SLURM_RESERVATION_NAME_PREFIX}{FAKE_CAPACITY_BLOCK_ID}" + check_res_mock = mocker.patch( + "slurm_plugin.capacity_block_manager.does_slurm_reservation_exist", return_value=reservation_exists + ) + create_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.create_slurm_reservation") + update_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.update_slurm_reservation") + delete_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.delete_slurm_reservation") + expected_start_time = datetime(2020, 1, 1, 0, 0, 0) + mocker.patch("slurm_plugin.capacity_block_manager.datetime").now.return_value = expected_start_time + + capacity_block_manager._update_slurm_reservation(capacity_block) + + # check the right commands to create/delete/update reservations are called accordingly to the state + check_res_mock.assert_called_with(name=slurm_reservation_name) + msg_prefix = f"Capacity Block reservation {FAKE_CAPACITY_BLOCK_ID} is in state {state}. " + msg_suffix = f" Slurm reservation {slurm_reservation_name} for nodes {nodenames}." 
+ + # when state is != active + if expected_create_res_call: + create_res_mock.assert_called_with( + name=slurm_reservation_name, start_time=expected_start_time, nodes=nodenames + ) + assert_that(caplog.text).contains(msg_prefix + "Creating related" + msg_suffix) + if expected_update_res_call: + update_res_mock.assert_called_with(name=slurm_reservation_name, nodes=nodenames) + assert_that(caplog.text).contains(msg_prefix + "Updating existing related" + msg_suffix) + + # when state is active + if expected_delete_res_call: + delete_res_mock.assert_called_with(name=slurm_reservation_name) + assert_that(caplog.text).contains(msg_prefix + "Deleting related" + msg_suffix) + + if state == "active" and not reservation_exists: + assert_that(caplog.text).contains(msg_prefix + "Nothing to do. No existing" + msg_suffix) + + @pytest.mark.parametrize( + ( + "init_capacity_blocks", + "capacity_blocks_from_config", + "capacity_blocks_info_from_ec2", + "expected_new_capacity_blocks", + "expected_new_update_time", + ), + [ + # nothing in the config, just change update time + ({}, {}, [], {}, True), + # new config without info, remove old block from the map + ( + { + "cr-987654": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + }, + {}, + [], + {}, + True, + ), + # update old values with new values + ( + { + "cr-987654": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + }, + { + "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + "cr-234567": CapacityBlock("id2", "queue-cb2", "compute-resource-cb2"), + }, + [ + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} + ), + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} + ), + ], + { + "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + "cr-234567": CapacityBlock("id2", "queue-cb2", "compute-resource-cb2"), + }, + True, + ), + ], + ) + def test_update_capacity_blocks_info_from_ec2( + self, + mocker, + capacity_block_manager, + capacity_blocks_from_config, + init_capacity_blocks, + expected_new_capacity_blocks, + capacity_blocks_info_from_ec2, + expected_new_update_time, + ): + mocker.patch.object( + capacity_block_manager, "_capacity_blocks_from_config", return_value=capacity_blocks_from_config + ) + mocked_now = datetime(2020, 1, 1, 0, 0, 0) + mocker.patch("slurm_plugin.capacity_block_manager.datetime").now.return_value = mocked_now + capacity_block_manager._capacity_blocks = init_capacity_blocks + + mocked_client = mocker.MagicMock() + mocked_client.return_value.describe_capacity_reservations.return_value = capacity_blocks_info_from_ec2 + capacity_block_manager._ec2_client = mocked_client + + capacity_block_manager._update_capacity_blocks_info_from_ec2() + + assert_that(expected_new_capacity_blocks).is_equal_to(capacity_block_manager._capacity_blocks) + assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(mocked_now) + if expected_new_capacity_blocks: + # verify that all the blocks have the updated info from ec2 + assert_that( + capacity_block_manager._capacity_blocks.get("cr-123456")._capacity_block_reservation_info + ).is_equal_to( + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} + ) + ) + assert_that( + capacity_block_manager._capacity_blocks.get("cr-234567")._capacity_block_reservation_info + ).is_equal_to( + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, 
"State": "pending", "CapacityReservationId": "cr-234567"} + ) + ) + + @pytest.mark.parametrize( + ("fleet_config", "expected_capacity_blocks", "expected_exception"), + [ + ({}, {}, False), + ( + { + "queue-cb": { + "compute-resource-cb": { + "CapacityType": "capacity-block", + "CapacityReservationId": "cr-123456", + } + }, + "queue1": { + "compute-resource1": {"CapacityType": "on-demand", "CapacityReservationId": "cr-123456"} + }, + "queue-cb2": { + "compute-resource-cb2": { + "CapacityType": "capacity-block", + "CapacityReservationId": "cr-234567", + } + }, + }, + { + "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), + "cr-234567": CapacityBlock("cr-234567", "queue-cb2", "compute-resource-cb2"), + }, + False, + ), + ( + {"broken-queue-without-id": {"compute-resource-cb": {"CapacityType": "capacity-block"}}}, + {}, + True, + ), + ( + {"queue-with-cr-id-but-no-cb": {"compute-resource-cb": {"CapacityReservationId": "cr-123456"}}}, + {}, + False, + ), + ], + ) + def test_capacity_blocks_from_config( + self, capacity_block_manager, fleet_config, expected_capacity_blocks, expected_exception + ): + capacity_block_manager._fleet_config = fleet_config + + if expected_exception: + with pytest.raises(KeyError): + capacity_block_manager._capacity_blocks_from_config() + else: + assert_that(expected_capacity_blocks).is_equal_to(capacity_block_manager._capacity_blocks_from_config()) + + @pytest.mark.parametrize( + ("compute_resource_config", "expected_result"), + [ + ({}, False), + ({"CapacityType": "spot"}, False), + ({"CapacityType": "on-demand"}, False), + ({"CapacityType": "capacity-block"}, True), + ], + ) + def test__is_compute_resource_associated_to_capacity_block( + self, capacity_block_manager, compute_resource_config, expected_result + ): + assert_that( + capacity_block_manager._is_compute_resource_associated_to_capacity_block(compute_resource_config) + ).is_equal_to(expected_result) + + @pytest.mark.parametrize( + ("compute_resource_config", "expected_result", "expected_exception"), + [ + ({}, False, True), + ({"CapacityType": "spot"}, False, True), + ({"CapacityType": "spot", "CapacityReservationId": "cr-123456"}, "cr-123456", False), + ({"CapacityType": "on-demand", "CapacityReservationId": "cr-123456"}, "cr-123456", False), + ({"CapacityType": "capacity-block"}, True, True), + ({"CapacityType": "capacity-block", "CapacityReservationId": "cr-123456"}, "cr-123456", False), + ], + ) + def test_capacity_reservation_id_from_compute_resource_config( + self, capacity_block_manager, compute_resource_config, expected_result, expected_exception + ): + if expected_exception: + with pytest.raises(KeyError): + capacity_block_manager._capacity_reservation_id_from_compute_resource_config(compute_resource_config) + else: + assert_that( + capacity_block_manager._capacity_reservation_id_from_compute_resource_config(compute_resource_config) + ).is_equal_to(expected_result) diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index 1cc45f297..17bb646f3 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -203,8 +203,23 @@ def test_set_config(initialize_instance_manager_mock): worker_pool_max_backlog=10, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) - updated_config = SimpleNamespace( + updated_config_1 = SimpleNamespace( + some_key_1="some_value_1", + some_key_2="some_value_2", + insufficient_capacity_timeout=20, + 
worker_pool_size=5, + worker_pool_max_backlog=10, + cluster_name="cluster", + head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={"queue1": {"cr1": {"test": "test"}}}, + ) + updated_config_2 = SimpleNamespace( some_key_1="some_value_1", some_key_2="some_value_2_changed", insufficient_capacity_timeout=10, @@ -212,16 +227,21 @@ def test_set_config(initialize_instance_manager_mock): worker_pool_max_backlog=5, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={"queue1": {"cr1": {"test": "test"}}}, ) cluster_manager = ClusterManager(initial_config) assert_that(cluster_manager._config).is_equal_to(initial_config) cluster_manager.set_config(initial_config) assert_that(cluster_manager._config).is_equal_to(initial_config) - cluster_manager.set_config(updated_config) - assert_that(cluster_manager._config).is_equal_to(updated_config) + cluster_manager.set_config(updated_config_1) + assert_that(cluster_manager._config).is_equal_to(updated_config_1) + cluster_manager.set_config(updated_config_2) + assert_that(cluster_manager._config).is_equal_to(updated_config_2) - assert_that(initialize_instance_manager_mock.call_count).is_equal_to(2) + assert_that(initialize_instance_manager_mock.call_count).is_equal_to(3) @pytest.mark.usefixtures( @@ -238,6 +258,9 @@ def test_exception_from_report_console_output_from_nodes(mocker): compute_console_wait_time=10, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) unhealthy_nodes = [ StaticNode("queue1-st-c5xlarge-3", "ip-3", "hostname", "some_state", "queue1"), @@ -344,6 +367,9 @@ def test_clean_up_inactive_partition( insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) part = SlurmPartition("partition4", "placeholder_nodes", "INACTIVE") @@ -705,6 +731,9 @@ def test_handle_health_check( insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) @@ -771,6 +800,9 @@ def test_update_static_nodes_in_replacement(current_replacing_nodes, slurm_nodes insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) cluster_manager._static_nodes_in_replacement = current_replacing_nodes @@ -880,6 +912,9 @@ def test_handle_unhealthy_dynamic_nodes( disable_nodes_on_insufficient_capacity=disable_nodes_on_insufficient_capacity, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) mock_instance_manager = cluster_manager._instance_manager @@ -936,6 +971,9 @@ def test_handle_powering_down_nodes( insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) mock_instance_manager = cluster_manager._instance_manager @@ -1310,6 +1348,9 @@ def test_maintain_nodes( disable_nodes_on_insufficient_capacity=disable_nodes_on_insufficient_capacity, cluster_name="cluster", 
head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) cluster_manager._static_nodes_in_replacement = static_nodes_in_replacement @@ -1558,6 +1599,7 @@ def test_manage_cluster( node_replacement_timeout=1800, terminate_max_batch_size=1, head_node_instance_id="i-instance-id", + fleet_config={}, ) mocker.patch("time.sleep") cluster_manager = ClusterManager(mock_sync_config) @@ -2201,6 +2243,7 @@ def test_manage_cluster_boto3( ) cluster_manager._instance_manager._store_assigned_hostnames = mocker.MagicMock() cluster_manager._instance_manager._update_dns_hostnames = mocker.MagicMock() + mocker.patch.object(cluster_manager._capacity_block_manager, "get_reserved_nodenames", return_value=[]) cluster_manager.manage_cluster() assert_that(caplog.records).is_length(len(expected_error_messages)) @@ -2365,6 +2408,9 @@ def test_increase_partitions_protected_failure_count(nodes, initial_map, expecte insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) cluster_manager._partitions_protected_failure_count_map = initial_map @@ -2390,6 +2436,9 @@ def test_reset_partition_failure_count(mocker, partition, expected_map): insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) cluster_manager._partitions_protected_failure_count_map = {"queue1": 2, "queue2": 1} @@ -2461,6 +2510,9 @@ def test_handle_protected_mode_process( insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) caplog.set_level(logging.INFO) cluster_manager = ClusterManager(mock_sync_config) @@ -2515,6 +2567,9 @@ def test_enter_protected_mode( insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) mock_update_compute_fleet_status = mocker.patch.object(cluster_manager, "_update_compute_fleet_status") @@ -2593,6 +2648,9 @@ def test_is_node_being_replaced(current_replacing_nodes, node, instance, current insufficient_capacity_timeout=3, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) cluster_manager._current_time = current_time @@ -2652,6 +2710,9 @@ def test_is_node_replacement_timeout(node, current_node_in_replacement, is_repla insufficient_capacity_timeout=-2.2, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) cluster_manager._current_time = datetime(2020, 1, 2, 0, 0, 0) @@ -2721,6 +2782,9 @@ def test_handle_failed_health_check_nodes_in_replacement( insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) cluster_manager._static_nodes_in_replacement = current_nodes_in_replacement @@ -2782,6 +2846,9 @@ def test_handle_bootstrap_failure_nodes( insufficient_capacity_timeout=600, cluster_name="cluster", 
head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) for node, instance in zip(active_nodes, instances): @@ -2846,6 +2913,9 @@ def test_find_bootstrap_failure_nodes(active_nodes, instances): insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) for node, instance in zip(active_nodes, instances): @@ -3021,6 +3091,9 @@ def test_handle_ice_nodes( insufficient_capacity_timeout=600, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) caplog.set_level(logging.INFO) cluster_manager = ClusterManager(mock_sync_config) @@ -3369,6 +3442,9 @@ def test_reset_timeout_expired_compute_resources( insufficient_capacity_timeout=20, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(config) cluster_manager._current_time = datetime(2021, 1, 2, 0, 0, 0) @@ -3722,6 +3798,9 @@ def test_find_unhealthy_slurm_nodes( disable_nodes_on_insufficient_capacity=disable_nodes_on_insufficient_capacity, cluster_name="cluster", head_node_instance_id="i-instance-id", + region="region", + boto3_config=None, + fleet_config={}, ) cluster_manager = ClusterManager(mock_sync_config) # Run test From c4ab8d18080e0f3cbc4e8a44e8e82bbd1812f912 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 25 Oct 2023 12:56:22 +0200 Subject: [PATCH 10/35] Cleanup all the leftover slurm reservations even if not related to given nodes Previously the code was only deleting leftover slurm reservations from the given nodes, but if the CB is moved from a queue to another the list of nodes might not contain all the reservation. With this new logic we're retrieving all the slurm reservations from slurm and deleting them if they are no longer associated with existing CBs in the config. 
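For illustration, a minimal self-contained sketch of the cleanup decision this patch introduces. The "pcluster-" prefix and the SlurmReservation fields follow the diff below; find_leftover_reservations is a hypothetical stand-in for the selection logic inside _cleanup_leftover_slurm_reservations, which in the real code additionally calls delete_slurm_reservation on every match.

# Sketch of the leftover-reservation selection described above (not the real module).
from dataclasses import dataclass
from typing import Dict, List

SLURM_RESERVATION_NAME_PREFIX = "pcluster-"


@dataclass
class SlurmReservation:
    name: str
    state: str
    users: str
    nodes: str


def find_leftover_reservations(
    slurm_reservations: List[SlurmReservation], configured_capacity_block_ids: Dict[str, object]
) -> List[str]:
    """Return names of ParallelCluster-managed reservations whose Capacity Block is no longer configured."""
    leftovers = []
    for reservation in slurm_reservations:
        # Reservations without the "pcluster-" prefix were not created by ParallelCluster: skip them.
        if not reservation.name.startswith(SLURM_RESERVATION_NAME_PREFIX):
            continue
        capacity_block_id = reservation.name[len(SLURM_RESERVATION_NAME_PREFIX):]
        # A ParallelCluster reservation whose Capacity Block id is no longer in the config is a leftover.
        if capacity_block_id not in configured_capacity_block_ids:
            leftovers.append(reservation.name)
    return leftovers


if __name__ == "__main__":
    reservations = [
        SlurmReservation("other_reservation", "active", "anyone", "node1"),   # customer-owned, skipped
        SlurmReservation("pcluster-cr-123456", "active", "anyone", "node1"),  # CB still configured, kept
        SlurmReservation("pcluster-cr-987654", "active", "anyone", "node1"),  # CB removed from config, leftover
    ]
    print(find_leftover_reservations(reservations, {"cr-123456": object()}))  # ['pcluster-cr-987654']
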
Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 48 +++++++------- .../test_capacity_block_manager.py | 66 ++++--------------- 2 files changed, 37 insertions(+), 77 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 0856ba9d8..8bb4eaaac 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -17,6 +17,7 @@ create_slurm_reservation, delete_slurm_reservation, does_slurm_reservation_exist, + get_slurm_reservations_info, update_slurm_reservation, ) from common.time_utils import seconds_to_minutes @@ -142,9 +143,8 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): reserved_nodenames.extend(self._update_slurm_reservation(capacity_block)) self._reserved_nodenames = reserved_nodenames - # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks, - # by checking slurm reservation info of the given nodes - self._cleanup_leftover_slurm_reservations(nodes) + # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks + self._cleanup_leftover_slurm_reservations() return self._reserved_nodenames @@ -161,28 +161,28 @@ def _associate_nodenames_to_capacity_blocks(self, nodes: List[SlurmNode]): capacity_block.add_nodename(node.name) break - def _cleanup_leftover_slurm_reservations(self, nodes: List[StaticNode]): - """Find list of slurm reservations associated to the nodes not part of the configured CBs.""" - for node in nodes: - if node.reservation_name: - # nodes with a slurm reservation already in place - slurm_reservation_name = node.reservation_name - if CapacityBlock.is_capacity_block_slurm_reservation(slurm_reservation_name): - capacity_block_id = CapacityBlock.slurm_reservation_name_to_id(slurm_reservation_name) - if capacity_block_id not in self._capacity_blocks.keys(): - logger.info( - ( - "Found leftover slurm reservation %s. " - "Related Capacity Block %s is no longer in the cluster configuration. Deleting it." - ), - slurm_reservation_name, - capacity_block_id, - ) - delete_slurm_reservation(name=slurm_reservation_name) - else: - logger.debug( - "Slurm reservation %s is not managed by ParallelCluster. Skipping it.", slurm_reservation_name + def _cleanup_leftover_slurm_reservations(self): + """Find list of slurm reservations created by ParallelCluster but not part of the configured CBs.""" + slurm_reservations = get_slurm_reservations_info() + for slurm_reservation in slurm_reservations: + if CapacityBlock.is_capacity_block_slurm_reservation(slurm_reservation.name): + capacity_block_id = CapacityBlock.slurm_reservation_name_to_id(slurm_reservation.name) + if capacity_block_id not in self._capacity_blocks.keys(): + logger.info( + ( + "Found leftover slurm reservation %s for nodes %s. " + "Related Capacity Block %s is no longer in the cluster configuration. " + "Deleting the slurm reservation." + ), + slurm_reservation.name, + slurm_reservation.nodes, + capacity_block_id, ) + delete_slurm_reservation(name=slurm_reservation.name) + else: + logger.debug( + "Slurm reservation %s is not managed by ParallelCluster. 
Skipping it.", slurm_reservation.name + ) @staticmethod def _update_slurm_reservation(capacity_block: CapacityBlock): diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index d981022a5..095de2d81 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -15,7 +15,7 @@ import pytest from assertpy import assert_that from slurm_plugin.capacity_block_manager import SLURM_RESERVATION_NAME_PREFIX, CapacityBlock, CapacityBlockManager -from slurm_plugin.slurm_resources import StaticNode +from slurm_plugin.slurm_resources import SlurmReservation, StaticNode from aws.ec2 import CapacityBlockReservationInfo @@ -143,63 +143,22 @@ def test_associate_nodenames_to_capacity_blocks( ) @pytest.mark.parametrize( - ("nodes", "expected_leftover_slurm_reservations"), + ("slurm_reservations", "expected_leftover_slurm_reservations"), [ ( [ - # node without a slurm reservation -> skipped - StaticNode("queue1-st-compute-resource1-1", "ip-1", "hostname-1", "some_state", "queue1"), - # node with a reservation from the customer -> skipped - StaticNode( - "queue4-st-compute-resource1-1", - "ip-1", - "hostname-1", - "some_state", - "queue4", - reservation_name="other-reservation", - ), - # node associated with CB, not yet part of slurm reservation -> skipped - StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb"), - # node with a reservation associated with CB -> skipped - StaticNode( - "queue-cb-st-compute-resource-cb-2", - "ip-1", - "hostname-1", - "some_state", - "queue-cb", - reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-123456", - ), - # node with a reservation associated with an old CB but in the queue/cr associated to a new CB, - # this is a leftover reservation - StaticNode( - "queue-cb-st-compute-resource-cb-3", - "ip-1", - "hostname-1", - "some_state", - "queue-cb", - reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-876543", - ), - # node with a reservation associated with existing CB, - # but from an older queue/cr no longer in config -> skipped - StaticNode( - "queue2-st-compute-resource1-1", - "ip-1", - "hostname-1", - "some_state", - "queue2", - reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-123456", + # reservation from the customer -> skipped + SlurmReservation(name="other_reservation", state="active", users="anyone", nodes="node1"), + # reservation associated with existing CB -> skipped + SlurmReservation( + name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-123456", state="active", users="anyone", nodes="node1" ), # node associated with another old CB, no longer in the config, this is a leftover reservation - StaticNode( - "queue3-st-compute-resource1-1", - "ip-1", - "hostname-1", - "some_state", - "queue3", - reservation_name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654", + SlurmReservation( + name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654", state="active", users="anyone", nodes="node1" ), ], - [f"{SLURM_RESERVATION_NAME_PREFIX}cr-876543", f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654"], + [f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654"], ) ], ) @@ -208,13 +167,14 @@ def test_cleanup_leftover_slurm_reservations( mocker, capacity_block_manager, capacity_block, - nodes, + slurm_reservations, expected_leftover_slurm_reservations, ): # only cr-123456, queue-cb, compute-resource-cb is in the list of capacity blocks from config capacity_block_manager._capacity_blocks = {"cr-123456": capacity_block} + 
mocker.patch("slurm_plugin.capacity_block_manager.get_slurm_reservations_info", return_value=slurm_reservations) delete_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.delete_slurm_reservation") - capacity_block_manager._cleanup_leftover_slurm_reservations(nodes) + capacity_block_manager._cleanup_leftover_slurm_reservations() # verify that only the slurm reservation associated with a CB, no longer in the config, # are considered as leftover From cb3dd14457fe0464133f6a5e169566e20c7cfee2 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 09:31:23 +0200 Subject: [PATCH 11/35] Rename does_slurm_reservation_exist to is_slurm_reservation Signed-off-by: Enrico Usai --- src/common/schedulers/slurm_reservation_commands.py | 2 +- src/slurm_plugin/capacity_block_manager.py | 4 ++-- .../common/schedulers/test_slurm_reservation_commands.py | 8 ++++---- tests/slurm_plugin/test_capacity_block_manager.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/schedulers/slurm_reservation_commands.py b/src/common/schedulers/slurm_reservation_commands.py index b8eb3fc46..cc264518f 100644 --- a/src/common/schedulers/slurm_reservation_commands.py +++ b/src/common/schedulers/slurm_reservation_commands.py @@ -149,7 +149,7 @@ def _add_param(cmd, param_name, value): return cmd -def does_slurm_reservation_exist( +def is_slurm_reservation( name: str, command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, raise_on_error: bool = True, diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 8bb4eaaac..9a98ce5b3 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -16,8 +16,8 @@ from common.schedulers.slurm_reservation_commands import ( create_slurm_reservation, delete_slurm_reservation, - does_slurm_reservation_exist, get_slurm_reservations_info, + is_slurm_reservation, update_slurm_reservation, ) from common.time_utils import seconds_to_minutes @@ -212,7 +212,7 @@ def _log_cb_info(action_info): capacity_block_nodes = capacity_block.nodenames() capacity_block_nodenames = ",".join(capacity_block_nodes) - reservation_exists = does_slurm_reservation_exist(name=slurm_reservation_name) + reservation_exists = is_slurm_reservation(name=slurm_reservation_name) # if CB is active we need to remove Slurm reservation and start nodes if capacity_block.is_active(): # if Slurm reservation exists, delete it. 
diff --git a/tests/common/schedulers/test_slurm_reservation_commands.py b/tests/common/schedulers/test_slurm_reservation_commands.py index d3caaa15a..1d899d7fc 100644 --- a/tests/common/schedulers/test_slurm_reservation_commands.py +++ b/tests/common/schedulers/test_slurm_reservation_commands.py @@ -22,8 +22,8 @@ _parse_reservations_info, create_slurm_reservation, delete_slurm_reservation, - does_slurm_reservation_exist, get_slurm_reservations_info, + is_slurm_reservation, update_slurm_reservation, ) from slurm_plugin.slurm_resources import SlurmReservation @@ -288,7 +288,7 @@ def test_add_param(cmd, param_name, value, expected_cmd): ("root_1", CalledProcessError(1, "", "Generic error"), False, True, "Failed when retrieving"), ], ) -def test_does_slurm_reservation_exist( +def test_is_slurm_reservation( mocker, name, mocked_output, expected_output, expected_message, expected_exception, caplog ): caplog.set_level(logging.INFO) @@ -298,9 +298,9 @@ def test_does_slurm_reservation_exist( if expected_exception: with pytest.raises(CalledProcessError): - does_slurm_reservation_exist(name) + is_slurm_reservation(name) else: - assert_that(does_slurm_reservation_exist(name)).is_equal_to(expected_output) + assert_that(is_slurm_reservation(name)).is_equal_to(expected_output) if expected_message: assert_that(caplog.text).contains(expected_message) diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 095de2d81..347cd5444 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -218,7 +218,7 @@ def test_update_slurm_reservation( nodenames = ",".join(capacity_block.nodenames()) slurm_reservation_name = f"{SLURM_RESERVATION_NAME_PREFIX}{FAKE_CAPACITY_BLOCK_ID}" check_res_mock = mocker.patch( - "slurm_plugin.capacity_block_manager.does_slurm_reservation_exist", return_value=reservation_exists + "slurm_plugin.capacity_block_manager.is_slurm_reservation", return_value=reservation_exists ) create_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.create_slurm_reservation") update_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.update_slurm_reservation") From 9eefb806f78bfad8d1f8b8e257105d8515e17bf7 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 09:44:51 +0200 Subject: [PATCH 12/35] Define _is_time_to_update_capacity_blocks_info method and fix internal logic Return true if it's the moment to update capacity blocks info, from ec2 and from config. This is true when the CapacityBlockManager is not yet initialized `self._capacity_blocks_update_time == None` (this was missing before this patch) and every 10 minutes. Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 27 ++++++++++----- .../test_capacity_block_manager.py | 34 ++++++++++++++----- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 9a98ce5b3..e9c73af8a 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -125,12 +125,8 @@ def ec2_client(self): def get_reserved_nodenames(self, nodes: List[SlurmNode]): """Manage nodes part of capacity block reservation. 
Returns list of reserved nodes.""" # evaluate if it's the moment to update info - is_time_to_update = ( - self._capacity_blocks_update_time - and seconds_to_minutes(datetime.now(tz=timezone.utc) - self._capacity_blocks_update_time) - > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD - ) - if is_time_to_update: # TODO: evaluate time to update accordingly to capacity block start time + now = datetime.now(tz=timezone.utc) + if self._is_time_to_update_capacity_blocks_info(now): reserved_nodenames = [] # update capacity blocks details from ec2 (e.g. state) @@ -146,8 +142,24 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks self._cleanup_leftover_slurm_reservations() + self._capacity_blocks_update_time = now + return self._reserved_nodenames + def _is_time_to_update_capacity_blocks_info(self, now: datetime): + """ + Return true if it's the moment to update capacity blocks info, from ec2 and from config. + + This is true when the CapacityBlockManager is not yet initialized (self._capacity_blocks_update_time == None) + and every 10 minutes. + # TODO: evaluate time to update accordingly to capacity block start time + """ + return self._capacity_blocks_update_time is None or ( + self._capacity_blocks_update_time + and seconds_to_minutes((now - self._capacity_blocks_update_time).total_seconds()) + > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD + ) + def _associate_nodenames_to_capacity_blocks(self, nodes: List[SlurmNode]): """ Update capacity_block info adding nodenames list. @@ -247,7 +259,6 @@ def _update_capacity_blocks_info_from_ec2(self): This method is called every time the CapacityBlockManager is re-initialized, so when it starts/is restarted or when fleet configuration changes. """ - # Retrieve updated capacity reservation information at initialization time, and every tot minutes self._capacity_blocks = self._capacity_blocks_from_config() if self._capacity_blocks: @@ -264,8 +275,6 @@ def _update_capacity_blocks_info_from_ec2(self): capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() self._capacity_blocks[capacity_block_id].update_ec2_info(capacity_block_reservation_info) - self._capacity_blocks_update_time = datetime.now(tz=timezone.utc) - def _capacity_blocks_from_config(self): """ Collect list of capacity reservation target from all queues/compute-resources in the fleet config. 
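A standalone sketch of the refresh check introduced by this patch, assuming CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD is expressed in minutes and equals 10 (as the commit message states) and that seconds_to_minutes is integer division by 60 (consistent with tests/common/test_time_utils.py above).

# Sketch of _is_time_to_update_capacity_blocks_info; constant value and helper are assumptions.
from datetime import datetime, timezone
from typing import Optional

CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD = 10  # minutes (assumed value)


def seconds_to_minutes(value_in_seconds: float) -> int:
    return int(value_in_seconds // 60)


def is_time_to_update(last_update_time: Optional[datetime], now: datetime) -> bool:
    # Update on the first run, when the manager has not been initialized yet ...
    if last_update_time is None:
        return True
    # ... and then whenever more than the configured period has elapsed.
    return seconds_to_minutes((now - last_update_time).total_seconds()) > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD


if __name__ == "__main__":
    now = datetime(2020, 1, 2, 1, 51, 0, tzinfo=timezone.utc)
    print(is_time_to_update(None, now))                                                  # True: first run
    print(is_time_to_update(datetime(2020, 1, 2, 1, 50, 0, tzinfo=timezone.utc), now))   # False: 1 minute ago
    print(is_time_to_update(datetime(2020, 1, 2, 1, 40, 0, tzinfo=timezone.utc), now))   # True: 11 minutes ago
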
diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 347cd5444..b75b8fb16 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -102,6 +102,29 @@ def test_ec2_client(self, capacity_block_manager, mocker): capacity_block_manager.ec2_client() ec2_mock.assert_called_once() + @pytest.mark.parametrize( + ("previous_capacity_blocks_update_time", "expected_update_time"), + [ + # manager not initialized + (None, True), + # delta < CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD + (datetime(2020, 1, 2, 1, 51, 0), False), + (datetime(2020, 1, 2, 1, 50, 0), False), + # delta >= CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD + (datetime(2020, 1, 2, 1, 40, 0), True), + (datetime(2020, 1, 2, 0, 51, 0), True), + (datetime(2020, 1, 1, 0, 51, 0), True), + ], + ) + def test_is_time_to_update_capacity_blocks_info( + self, mocker, capacity_block_manager, previous_capacity_blocks_update_time, expected_update_time + ): + mocked_now = datetime(2020, 1, 2, 1, 51, 0) + mocker.patch("slurm_plugin.capacity_block_manager.datetime").now.return_value = mocked_now + + capacity_block_manager._capacity_blocks_update_time = previous_capacity_blocks_update_time + assert_that(capacity_block_manager._is_time_to_update_capacity_blocks_info(mocked_now)) + @pytest.mark.parametrize( ("capacity_blocks", "nodes", "expected_nodenames_in_capacity_block"), [ @@ -257,11 +280,10 @@ def test_update_slurm_reservation( "capacity_blocks_from_config", "capacity_blocks_info_from_ec2", "expected_new_capacity_blocks", - "expected_new_update_time", ), [ - # nothing in the config, just change update time - ({}, {}, [], {}, True), + # nothing in the config + ({}, {}, [], {}), # new config without info, remove old block from the map ( { @@ -270,7 +292,6 @@ def test_update_slurm_reservation( {}, [], {}, - True, ), # update old values with new values ( @@ -293,7 +314,6 @@ def test_update_slurm_reservation( "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), "cr-234567": CapacityBlock("id2", "queue-cb2", "compute-resource-cb2"), }, - True, ), ], ) @@ -305,13 +325,10 @@ def test_update_capacity_blocks_info_from_ec2( init_capacity_blocks, expected_new_capacity_blocks, capacity_blocks_info_from_ec2, - expected_new_update_time, ): mocker.patch.object( capacity_block_manager, "_capacity_blocks_from_config", return_value=capacity_blocks_from_config ) - mocked_now = datetime(2020, 1, 1, 0, 0, 0) - mocker.patch("slurm_plugin.capacity_block_manager.datetime").now.return_value = mocked_now capacity_block_manager._capacity_blocks = init_capacity_blocks mocked_client = mocker.MagicMock() @@ -321,7 +338,6 @@ def test_update_capacity_blocks_info_from_ec2( capacity_block_manager._update_capacity_blocks_info_from_ec2() assert_that(expected_new_capacity_blocks).is_equal_to(capacity_block_manager._capacity_blocks) - assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(mocked_now) if expected_new_capacity_blocks: # verify that all the blocks have the updated info from ec2 assert_that( From db71c505daab9c93f69ab6808ba009409d9633ec Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 11:18:53 +0200 Subject: [PATCH 13/35] Add robustness when updating info from ec2 Even if this should never happen, the code is now able to detect if we're trying to update info from capacity blocks not in the internal map. 
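A minimal illustration of the guard added by the diff below; plain dicts stand in for the CapacityBlock and CapacityBlockReservationInfo objects used by the real code, so only the KeyError-to-domain-error conversion is shown.

# Sketch of the robustness guard: an unexpected capacity block id returned by EC2 is
# surfaced as a domain-specific error instead of a bare KeyError.
from typing import Dict


class CapacityBlockManagerError(Exception):
    """Represent an error during the execution of an action with the CapacityBlockManager."""


def update_capacity_block(capacity_blocks: Dict[str, dict], capacity_block_id: str, ec2_info: dict) -> None:
    try:
        capacity_blocks[capacity_block_id].update(ec2_info)
    except KeyError:
        # Should never happen: EC2 is only queried for ids taken from the internal map.
        raise CapacityBlockManagerError(f"Unable to find capacity block {capacity_block_id} in the internal map")


if __name__ == "__main__":
    blocks = {"cr-123456": {}}
    update_capacity_block(blocks, "cr-123456", {"State": "active"})  # updates the known block
    try:
        update_capacity_block(blocks, "cr-234567", {"State": "active"})
    except CapacityBlockManagerError as error:
        print(error)  # Unable to find capacity block cr-234567 in the internal map
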
Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index e9c73af8a..9cd3164d3 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -32,6 +32,13 @@ SLURM_RESERVATION_NAME_PREFIX = "pcluster-" +class CapacityBlockManagerError(Exception): + """Represent an error during the execution of an action with the CapacityBlockManager.""" + + def __init__(self, message: str): + super().__init__(message) + + class CapacityType(Enum): """Enum to identify the type compute supported by the queues.""" @@ -271,9 +278,14 @@ def _update_capacity_blocks_info_from_ec2(self): CapacityBlockReservationInfo ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) - for capacity_block_reservation_info in capacity_block_reservations_info: - capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() + for capacity_block_reservation_info in capacity_block_reservations_info: + capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() + try: self._capacity_blocks[capacity_block_id].update_ec2_info(capacity_block_reservation_info) + except KeyError: + raise CapacityBlockManagerError( + f"Unable to find capacity block {capacity_block_id} in the internal map" + ) def _capacity_blocks_from_config(self): """ From 15f38f5b14bfb9436f455adb4a2417e93d82a52f Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 11:19:29 +0200 Subject: [PATCH 14/35] Improve main method of CapacityBlockManager Now the update of capacity_blocks info from config, from ec2 and from nodes is done in the same method. _update_slurm_reservation is now just updating reservations, the collection of reserved nodes is done upstream. Add unit tests to ensure the following items are correct: - update time attribute - internal capacity blocks list attribute - reserved nodes are only updated if there are no internal errors and only include nodes from not active CBs. Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 53 +++-- .../test_capacity_block_manager.py | 215 ++++++++++++++---- 2 files changed, 203 insertions(+), 65 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 9cd3164d3..16bec440e 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -136,14 +136,24 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): if self._is_time_to_update_capacity_blocks_info(now): reserved_nodenames = [] - # update capacity blocks details from ec2 (e.g. state) - self._update_capacity_blocks_info_from_ec2() - # associate nodenames to capacity blocks, according to queues and compute resources from fleet configuration - self._associate_nodenames_to_capacity_blocks(nodes) + # update list of capacity blocks from fleet config + self._capacity_blocks = self._capacity_blocks_from_config() + + if self._capacity_blocks: + # update capacity blocks details from ec2 (e.g. state) + self._update_capacity_blocks_info_from_ec2() + # associate nodenames to capacity blocks, + # according to queues and compute resources from fleet configuration + self._associate_nodenames_to_capacity_blocks(nodes) + + # create, update or delete slurm reservation for the nodes according to CB details. 
+ for capacity_block in self._capacity_blocks.values(): + self._update_slurm_reservation(capacity_block) + + # If CB is in not yet active or expired add nodes to list of reserved nodes + if not capacity_block.is_active(): + reserved_nodenames.extend(capacity_block.nodenames()) - # create, update or delete slurm reservation for the nodes according to CB details. - for capacity_block in self._capacity_blocks.values(): - reserved_nodenames.extend(self._update_slurm_reservation(capacity_block)) self._reserved_nodenames = reserved_nodenames # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks @@ -210,8 +220,6 @@ def _update_slurm_reservation(capacity_block: CapacityBlock): A CB has five possible states: payment-pending, pending, active, expired and payment-failed, we need to create/delete Slurm reservation accordingly. - - returns list of nodes reserved for that capacity block, if it's not active. """ def _log_cb_info(action_info): @@ -224,12 +232,9 @@ def _log_cb_info(action_info): capacity_block_nodenames, ) - nodes_in_slurm_reservation = [] - # retrieve list of nodes associated to a given slurm reservation/capacity block slurm_reservation_name = capacity_block.slurm_reservation_name() - capacity_block_nodes = capacity_block.nodenames() - capacity_block_nodenames = ",".join(capacity_block_nodes) + capacity_block_nodenames = ",".join(capacity_block.nodenames()) reservation_exists = is_slurm_reservation(name=slurm_reservation_name) # if CB is active we need to remove Slurm reservation and start nodes @@ -244,7 +249,6 @@ def _log_cb_info(action_info): # if CB is expired or not active we need to (re)create Slurm reservation # to avoid considering nodes as unhealthy else: - nodes_in_slurm_reservation = capacity_block_nodes # create or update Slurm reservation if reservation_exists: _log_cb_info("Updating existing related") @@ -257,8 +261,6 @@ def _log_cb_info(action_info): nodes=capacity_block_nodenames, ) - return nodes_in_slurm_reservation - def _update_capacity_blocks_info_from_ec2(self): """ Store in the _capacity_reservations a dict for CapacityReservation info. @@ -266,17 +268,14 @@ def _update_capacity_blocks_info_from_ec2(self): This method is called every time the CapacityBlockManager is re-initialized, so when it starts/is restarted or when fleet configuration changes. 
""" - self._capacity_blocks = self._capacity_blocks_from_config() - - if self._capacity_blocks: - capacity_block_ids = self._capacity_blocks.keys() - logger.info( - "Retrieving updated Capacity Block reservation information from EC2 for %s", - ",".join(capacity_block_ids), - ) - capacity_block_reservations_info: List[ - CapacityBlockReservationInfo - ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) + capacity_block_ids = self._capacity_blocks.keys() + logger.info( + "Retrieving updated Capacity Block reservation information from EC2 for %s", + ",".join(capacity_block_ids), + ) + capacity_block_reservations_info: List[ + CapacityBlockReservationInfo + ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) for capacity_block_reservation_info in capacity_block_reservations_info: capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index b75b8fb16..76fdc1e18 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -14,8 +14,13 @@ import pytest from assertpy import assert_that -from slurm_plugin.capacity_block_manager import SLURM_RESERVATION_NAME_PREFIX, CapacityBlock, CapacityBlockManager -from slurm_plugin.slurm_resources import SlurmReservation, StaticNode +from slurm_plugin.capacity_block_manager import ( + SLURM_RESERVATION_NAME_PREFIX, + CapacityBlock, + CapacityBlockManager, + CapacityBlockManagerError, +) +from slurm_plugin.slurm_resources import DynamicNode, SlurmReservation, StaticNode from aws.ec2 import CapacityBlockReservationInfo @@ -102,6 +107,137 @@ def test_ec2_client(self, capacity_block_manager, mocker): capacity_block_manager.ec2_client() ec2_mock.assert_called_once() + @pytest.mark.parametrize( + ( + "is_time_to_update", + "previous_reserved_nodenames", + "capacity_blocks_from_config", + "capacity_blocks_info_from_ec2", + "nodes", + "expected_reserved_nodenames", + ), + [ + ( + # no time to update, preserve old nodenames + False, + ["node1"], + {}, + [], + [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + ["node1"], + ), + ( + # capacity block from config is empty, remove old nodenames + True, + ["node1"], + {}, + [], + [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [], # empty because capacity block from config is empty + ), + ( + # preserve old value because there is an exception + True, + ["node1"], + { + "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), + }, + [ + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} + ), + # add another id not in the capacity block to trigger an exception + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-234567"} + ), + ], + [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + ["node1"], + ), + ( + True, + ["node1"], + { + "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), + "cr-234567": CapacityBlock("cr-234567", "queue-cb2", "compute-resource-cb2"), + "cr-345678": CapacityBlock("cr-345678", "queue-cb3", "compute-resource-cb3"), + }, + [ + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} + ), + 
CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-234567"} + ), + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-345678"} + ), + ], + [ + StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb"), + DynamicNode("queue-cb2-dy-compute-resource-cb2-1", "ip-1", "hostname-1", "some_state", "queue-cb2"), + DynamicNode("queue-cb3-dy-compute-resource-cb3-1", "ip-1", "hostname-1", "some_state", "queue-cb3"), + ], + ["queue-cb-st-compute-resource-cb-1", "queue-cb3-dy-compute-resource-cb3-1"], + ), + ], + ) + def test_get_reserved_nodenames( + self, + mocker, + capacity_block_manager, + is_time_to_update, + previous_reserved_nodenames, + capacity_blocks_from_config, + capacity_blocks_info_from_ec2, + nodes, + expected_reserved_nodenames, + ): + mocker.patch.object( + capacity_block_manager, "_is_time_to_update_capacity_blocks_info", return_value=is_time_to_update + ) + mocker.patch.object( + capacity_block_manager, "_capacity_blocks_from_config", return_value=capacity_blocks_from_config + ) + mocked_client = mocker.MagicMock() + mocked_client.return_value.describe_capacity_reservations.return_value = capacity_blocks_info_from_ec2 + capacity_block_manager._ec2_client = mocked_client + update_res_mock = mocker.patch.object(capacity_block_manager, "_update_slurm_reservation") + cleanup_mock = mocker.patch.object(capacity_block_manager, "_cleanup_leftover_slurm_reservations") + capacity_block_manager._reserved_nodenames = previous_reserved_nodenames + previous_update_time = datetime(2020, 1, 1, 0, 0, 0) + capacity_block_manager._capacity_blocks_update_time = previous_update_time + + # use a trick to trigger an internal exception + expected_exception = len(capacity_blocks_from_config) != len(capacity_blocks_info_from_ec2) + if expected_exception: + with pytest.raises(CapacityBlockManagerError): + capacity_block_manager.get_reserved_nodenames(nodes) + assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) + else: + reserved_nodenames = capacity_block_manager.get_reserved_nodenames(nodes) + assert_that(reserved_nodenames).is_equal_to(expected_reserved_nodenames) + assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) + + if is_time_to_update: + if expected_exception: + assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(previous_update_time) + update_res_mock.assert_not_called() + cleanup_mock.assert_not_called() + else: + assert_that(capacity_block_manager._capacity_blocks_update_time).is_not_equal_to(previous_update_time) + update_res_mock.assert_has_calls( + [call(capacity_block) for capacity_block in capacity_block_manager._capacity_blocks.values()], + any_order=True, + ) + cleanup_mock.assert_called_once() + assert_that(capacity_block_manager._capacity_blocks).is_equal_to(capacity_blocks_from_config) + + else: + assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(previous_update_time) + update_res_mock.assert_not_called() + cleanup_mock.assert_not_called() + @pytest.mark.parametrize( ("previous_capacity_blocks_update_time", "expected_update_time"), [ @@ -275,32 +411,30 @@ def test_update_slurm_reservation( assert_that(caplog.text).contains(msg_prefix + "Nothing to do. 
No existing" + msg_suffix) @pytest.mark.parametrize( - ( - "init_capacity_blocks", - "capacity_blocks_from_config", - "capacity_blocks_info_from_ec2", - "expected_new_capacity_blocks", - ), + ("init_capacity_blocks", "capacity_blocks_info_from_ec2", "expected_exception"), [ # nothing in the config - ({}, {}, [], {}), - # new config without info, remove old block from the map + ({}, [], None), + # exception, because trying to update a capacity block not in the list, keep previous values ( { - "cr-987654": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), }, - {}, - [], - {}, + [ + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} + ), + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} + ), + ], + "Unable to find capacity block cr-234567", ), # update old values with new values ( { - "cr-987654": CapacityBlock("id", "queue-cb", "compute-resource-cb"), - }, - { - "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), - "cr-234567": CapacityBlock("id2", "queue-cb2", "compute-resource-cb2"), + "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), + "cr-234567": CapacityBlock("cr-234567", "queue-cb", "compute-resource-cb"), }, [ CapacityBlockReservationInfo( @@ -310,10 +444,7 @@ def test_update_slurm_reservation( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} ), ], - { - "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), - "cr-234567": CapacityBlock("id2", "queue-cb2", "compute-resource-cb2"), - }, + None, ), ], ) @@ -321,25 +452,21 @@ def test_update_capacity_blocks_info_from_ec2( self, mocker, capacity_block_manager, - capacity_blocks_from_config, init_capacity_blocks, - expected_new_capacity_blocks, capacity_blocks_info_from_ec2, + expected_exception, ): - mocker.patch.object( - capacity_block_manager, "_capacity_blocks_from_config", return_value=capacity_blocks_from_config - ) capacity_block_manager._capacity_blocks = init_capacity_blocks mocked_client = mocker.MagicMock() mocked_client.return_value.describe_capacity_reservations.return_value = capacity_blocks_info_from_ec2 capacity_block_manager._ec2_client = mocked_client - capacity_block_manager._update_capacity_blocks_info_from_ec2() + if expected_exception: + with pytest.raises(CapacityBlockManagerError, match=expected_exception): + capacity_block_manager._update_capacity_blocks_info_from_ec2() - assert_that(expected_new_capacity_blocks).is_equal_to(capacity_block_manager._capacity_blocks) - if expected_new_capacity_blocks: - # verify that all the blocks have the updated info from ec2 + # assert that only existing item has been updated assert_that( capacity_block_manager._capacity_blocks.get("cr-123456")._capacity_block_reservation_info ).is_equal_to( @@ -347,13 +474,25 @@ def test_update_capacity_blocks_info_from_ec2( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} ) ) - assert_that( - capacity_block_manager._capacity_blocks.get("cr-234567")._capacity_block_reservation_info - ).is_equal_to( - CapacityBlockReservationInfo( - {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} + assert_that(capacity_block_manager._capacity_blocks.get("cr-234567")).is_none() + else: + capacity_block_manager._update_capacity_blocks_info_from_ec2() + if init_capacity_blocks: + # 
verify that all the blocks have the updated info from ec2 + assert_that( + capacity_block_manager._capacity_blocks.get("cr-123456")._capacity_block_reservation_info + ).is_equal_to( + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} + ) + ) + assert_that( + capacity_block_manager._capacity_blocks.get("cr-234567")._capacity_block_reservation_info + ).is_equal_to( + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} + ) ) - ) @pytest.mark.parametrize( ("fleet_config", "expected_capacity_blocks", "expected_exception"), From dd7c93d448e208f4a50e067b20416141306d21df Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 12:34:45 +0200 Subject: [PATCH 15/35] Minor improvements to debugging messages Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 15 +++++++++++---- tests/slurm_plugin/test_capacity_block_manager.py | 6 +++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 16bec440e..3c05002b2 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -184,7 +184,6 @@ def _associate_nodenames_to_capacity_blocks(self, nodes: List[SlurmNode]): Check configured CBs and associate nodes to them according to queue and compute resource info. """ for node in nodes: - capacity_block: CapacityBlock for capacity_block in self._capacity_blocks.values(): if capacity_block.does_node_belong_to(node): capacity_block.add_nodename(node.name) @@ -208,6 +207,14 @@ def _cleanup_leftover_slurm_reservations(self): capacity_block_id, ) delete_slurm_reservation(name=slurm_reservation.name) + else: + logger.debug( + ( + "Slurm reservation %s is managed by ParallelCluster " + "and related to an existing Capacity Block. Skipping it." + ), + slurm_reservation.name, + ) else: logger.debug( "Slurm reservation %s is not managed by ParallelCluster. Skipping it.", slurm_reservation.name @@ -241,7 +248,7 @@ def _log_cb_info(action_info): if capacity_block.is_active(): # if Slurm reservation exists, delete it. if reservation_exists: - _log_cb_info("Deleting related") + _log_cb_info("Deleting") delete_slurm_reservation(name=slurm_reservation_name) else: _log_cb_info("Nothing to do. 
No existing") @@ -251,10 +258,10 @@ def _log_cb_info(action_info): else: # create or update Slurm reservation if reservation_exists: - _log_cb_info("Updating existing related") + _log_cb_info("Updating existing") update_slurm_reservation(name=slurm_reservation_name, nodes=capacity_block_nodenames) else: - _log_cb_info("Creating related") + _log_cb_info("Creating") create_slurm_reservation( name=slurm_reservation_name, start_time=datetime.now(tz=timezone.utc), diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 76fdc1e18..9dbb02dad 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -397,15 +397,15 @@ def test_update_slurm_reservation( create_res_mock.assert_called_with( name=slurm_reservation_name, start_time=expected_start_time, nodes=nodenames ) - assert_that(caplog.text).contains(msg_prefix + "Creating related" + msg_suffix) + assert_that(caplog.text).contains(msg_prefix + "Creating" + msg_suffix) if expected_update_res_call: update_res_mock.assert_called_with(name=slurm_reservation_name, nodes=nodenames) - assert_that(caplog.text).contains(msg_prefix + "Updating existing related" + msg_suffix) + assert_that(caplog.text).contains(msg_prefix + "Updating existing" + msg_suffix) # when state is active if expected_delete_res_call: delete_res_mock.assert_called_with(name=slurm_reservation_name) - assert_that(caplog.text).contains(msg_prefix + "Deleting related" + msg_suffix) + assert_that(caplog.text).contains(msg_prefix + "Deleting" + msg_suffix) if state == "active" and not reservation_exists: assert_that(caplog.text).contains(msg_prefix + "Nothing to do. No existing" + msg_suffix) From d1deabe786a73e059d0b0af40ff43d0c8312d5d4 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 14:46:26 +0200 Subject: [PATCH 16/35] Move management of capacity blocks in a common place for static and dynamic nodes Previously the logic was only applied to static nodes, now the list of reserved nodes is evaluated for all the nodes. Do not add reserved nodes to the list of all unhealthy nodes. Added a new configuration parameter disable_capacity_blocks_management. 
Signed-off-by: Enrico Usai --- src/slurm_plugin/clustermgtd.py | 31 +++++++-- tests/slurm_plugin/test_clustermgtd.py | 87 +++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 8 deletions(-) diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index 89758f54f..810d2fdfb 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -155,6 +155,7 @@ class ClustermgtdConfig: "disable_all_cluster_management": False, "health_check_timeout": 180, "health_check_timeout_after_slurmdstarttime": 180, + "disable_capacity_blocks_management": False, # DNS domain configs "hosted_zone": None, "dns_domain": None, @@ -270,6 +271,11 @@ def _get_health_check_config(self, config): "disable_all_health_checks", fallback=(self.disable_ec2_health_check and self.disable_scheduled_event_health_check), ) + self.disable_capacity_blocks_management = config.getboolean( + "clustermgtd", + "disable_capacity_blocks_management", + fallback=self.DEFAULTS.get("disable_capacity_block_management"), + ) def _get_terminate_config(self, config): """Get config option related to instance termination and node replacement.""" @@ -734,8 +740,26 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes): unhealthy_dynamic_nodes = [] ice_compute_resources_and_nodes_map = {} all_unhealthy_nodes = [] + + # Remove the nodes part of unactive Capacity Blocks from the list of unhealthy nodes. + # Nodes from active Capacity Blocks will be instead managed as unhealthy instances. + reserved_nodenames = [] + if not self._config.disable_capacity_blocks_management: + reserved_nodenames = self._capacity_block_manager.get_reserved_nodenames(slurm_nodes) + log.info( + ( + "The nodes %s are associated to unactive Capacity Blocks, " + "they will not be considered as unhealthy nodes." + ), + ",".join(reserved_nodenames), + ) + for node in slurm_nodes: if not node.is_healthy(self._config.terminate_drain_nodes, self._config.terminate_down_nodes): + if not self._config.disable_capacity_blocks_management and node.name in reserved_nodenames: + # do not consider as unhealthy the nodes reserved for capacity blocks + continue + all_unhealthy_nodes.append(node) if isinstance(node, StaticNode): @@ -829,12 +853,7 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes): except Exception as e: log.error("Encountered exception when retrieving console output from unhealthy static nodes: %s", e) - # Remove the nodes part of the unactive Capacity Block reservations from the list of unhealthy nodes. - # nodes from active Capacity Block reservation will be instead managed as standard static instances. 
- reserved_unhealthy_nodenames = self._capacity_block_manager.get_reserved_nodenames(unhealthy_static_nodes) - log.info("Removing reserved nodes %s from list of unhealthy nodes", ",".join(reserved_unhealthy_nodenames)) - node_list = [node.name for node in unhealthy_static_nodes if node.name not in reserved_unhealthy_nodenames] - + node_list = [node.name for node in unhealthy_static_nodes] # Set nodes into down state so jobs can be requeued immediately try: log.info("Setting unhealthy static nodes to DOWN") diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index 17bb646f3..35d4142b2 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -1346,6 +1346,7 @@ def test_maintain_nodes( terminate_down_nodes=True, insufficient_capacity_timeout=20, disable_nodes_on_insufficient_capacity=disable_nodes_on_insufficient_capacity, + disable_capacity_blocks_management=False, cluster_name="cluster", head_node_instance_id="i-instance-id", region="region", @@ -1356,6 +1357,9 @@ def test_maintain_nodes( cluster_manager._static_nodes_in_replacement = static_nodes_in_replacement cluster_manager._current_time = datetime(2020, 1, 2, 0, 0, 0) cluster_manager._config.node_replacement_timeout = 30 + mock_reserved_nodes = mocker.patch.object( + cluster_manager._capacity_block_manager, "get_reserved_nodenames", return_value=[] + ) mock_update_replacement = mocker.patch.object(cluster_manager, "_update_static_nodes_in_replacement", autospec=True) mock_handle_dynamic = mocker.patch.object(cluster_manager, "_handle_unhealthy_dynamic_nodes", autospec=True) mock_handle_static = mocker.patch.object(cluster_manager, "_handle_unhealthy_static_nodes", autospec=True) @@ -1375,6 +1379,7 @@ def test_maintain_nodes( # Run test cluster_manager._maintain_nodes(partitions, {}) # Check function calls + mock_reserved_nodes.assert_called_with(active_nodes) mock_update_replacement.assert_called_with(active_nodes) mock_handle_dynamic.assert_called_with(expected_unhealthy_dynamic_nodes) mock_handle_static.assert_called_with(expected_unhealthy_static_nodes) @@ -3563,8 +3568,11 @@ def test_set_ice_compute_resources_to_down( @pytest.mark.parametrize( - "active_nodes, expected_unhealthy_dynamic_nodes, expected_unhealthy_static_nodes, " - "expected_ice_compute_resources_and_nodes_map, disable_nodes_on_insufficient_capacity", + ( + "active_nodes, reserved_nodenames, expected_unhealthy_dynamic_nodes, expected_unhealthy_static_nodes, " + "expected_ice_compute_resources_and_nodes_map, disable_nodes_on_insufficient_capacity, " + "disable_capacity_blocks_management" + ), [ ( [ @@ -3624,6 +3632,7 @@ def test_set_ice_compute_resources_to_down( "[root@2023-01-31T21:24:55]", ), ], + [], [ DynamicNode( "queue1-dy-c5xlarge-2", "ip-2", "hostname", "IDLE+CLOUD+POWERING_DOWN", "queue1" @@ -3677,6 +3686,7 @@ def test_set_ice_compute_resources_to_down( }, }, True, + False, ), ( [ @@ -3735,7 +3745,24 @@ def test_set_ice_compute_resources_to_down( "(Code:InsufficientHostCapacity)Temporarily disabling node due to insufficient capacity " "[root@2023-01-31T21:24:55]", ), + StaticNode( + "queue4-st-c5xlarge-1", + "ip-1", + "hostname", + "DOWN+CLOUD+MAINTENANCE+RESERVED", + "queue4", + reservation_name="cr-123456", + ), # reserved static + DynamicNode( + "queue4-dy-c5xlarge-1", + "ip-1", + "hostname", + "DOWN+CLOUD+MAINTENANCE+RESERVED", + "queue4", + reservation_name="cr-234567", + ), # reserved dynamic ], + ["queue4-st-c5xlarge-1", "queue4-dy-c5xlarge-1"], [ DynamicNode( 
"queue1-dy-c5xlarge-2", "ip-2", "hostname", "IDLE+CLOUD+POWERING_DOWN", "queue1" @@ -3779,6 +3806,51 @@ def test_set_ice_compute_resources_to_down( ], {}, False, + False, + ), + ( + [ + StaticNode( + "queue4-st-c5xlarge-1", + "ip-1", + "hostname", + "DOWN+CLOUD+MAINTENANCE+RESERVED", + "queue4", + reservation_name="cr-123456", + ), # reserved static + DynamicNode( + "queue4-dy-c5xlarge-1", + "ip-1", + "hostname", + "DOWN+CLOUD+MAINTENANCE+RESERVED", + "queue4", + reservation_name="cr-234567", + ), # reserved dynamic + ], + ["queue4-st-c5xlarge-1", "queue4-dy-c5xlarge-1"], + [ + DynamicNode( + "queue4-dy-c5xlarge-1", + "ip-1", + "hostname", + "DOWN+CLOUD+MAINTENANCE+RESERVED", + "queue4", + reservation_name="cr-234567", + ), + ], + [ + StaticNode( + "queue4-st-c5xlarge-1", + "ip-1", + "hostname", + "DOWN+CLOUD+MAINTENANCE+RESERVED", + "queue4", + reservation_name="cr-123456", + ), + ], + {}, + False, + True, # disable_capacity_blocks_management ), ], ) @@ -3787,10 +3859,13 @@ def test_set_ice_compute_resources_to_down( ) def test_find_unhealthy_slurm_nodes( active_nodes, + reserved_nodenames, expected_unhealthy_dynamic_nodes, expected_unhealthy_static_nodes, expected_ice_compute_resources_and_nodes_map, disable_nodes_on_insufficient_capacity, + disable_capacity_blocks_management, + mocker, ): mock_sync_config = SimpleNamespace( terminate_drain_nodes=True, @@ -3801,8 +3876,12 @@ def test_find_unhealthy_slurm_nodes( region="region", boto3_config=None, fleet_config={}, + disable_capacity_blocks_management=disable_capacity_blocks_management, ) cluster_manager = ClusterManager(mock_sync_config) + get_reserved_mock = mocker.patch.object( + cluster_manager._capacity_block_manager, "get_reserved_nodenames", return_value=reserved_nodenames + ) # Run test ( unhealthy_dynamic_nodes, @@ -3813,6 +3892,10 @@ def test_find_unhealthy_slurm_nodes( assert_that(unhealthy_dynamic_nodes).is_equal_to(expected_unhealthy_dynamic_nodes) assert_that(unhealthy_static_nodes).is_equal_to(expected_unhealthy_static_nodes) assert_that(ice_compute_resources_and_nodes_map).is_equal_to(expected_ice_compute_resources_and_nodes_map) + if disable_capacity_blocks_management: + get_reserved_mock.assert_not_called() + else: + get_reserved_mock.assert_called() @pytest.mark.parametrize( From 4639b1c31b1541cecb2084b52b4da4ce2184a0d2 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 14:50:27 +0200 Subject: [PATCH 17/35] Rename _capacity_blocks_from_config to _retrieve_capacity_blocks_from_fleet_config Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 8 ++++---- tests/slurm_plugin/test_capacity_block_manager.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 3c05002b2..8b0ce4921 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -137,7 +137,7 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): reserved_nodenames = [] # update list of capacity blocks from fleet config - self._capacity_blocks = self._capacity_blocks_from_config() + self._capacity_blocks = self._retrieve_capacity_blocks_from_fleet_config() if self._capacity_blocks: # update capacity blocks details from ec2 (e.g. state) @@ -270,7 +270,7 @@ def _log_cb_info(action_info): def _update_capacity_blocks_info_from_ec2(self): """ - Store in the _capacity_reservations a dict for CapacityReservation info. 
+ Update capacity blocks in self._capacity_blocks by adding capacity reservation info. This method is called every time the CapacityBlockManager is re-initialized, so when it starts/is restarted or when fleet configuration changes. @@ -293,7 +293,7 @@ def _update_capacity_blocks_info_from_ec2(self): f"Unable to find capacity block {capacity_block_id} in the internal map" ) - def _capacity_blocks_from_config(self): + def _retrieve_capacity_blocks_from_fleet_config(self): """ Collect list of capacity reservation target from all queues/compute-resources in the fleet config. @@ -303,7 +303,7 @@ def _capacity_blocks_from_config(self): "my-compute-resource": { "Api": "create-fleet", "CapacityType": "on-demand|spot|capacity-block", - "AllocationStrategy": "lowest-price|capacity-optimized|use-capacity-reservations-first", + "AllocationStrategy": "lowest-price|capacity-optimized", "Instances": [ { "InstanceType": "p4d.24xlarge" } ], diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 9dbb02dad..6d0aef556 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -197,7 +197,9 @@ def test_get_reserved_nodenames( capacity_block_manager, "_is_time_to_update_capacity_blocks_info", return_value=is_time_to_update ) mocker.patch.object( - capacity_block_manager, "_capacity_blocks_from_config", return_value=capacity_blocks_from_config + capacity_block_manager, + "_retrieve_capacity_blocks_from_fleet_config", + return_value=capacity_blocks_from_config, ) mocked_client = mocker.MagicMock() mocked_client.return_value.describe_capacity_reservations.return_value = capacity_blocks_info_from_ec2 @@ -534,16 +536,18 @@ def test_update_capacity_blocks_info_from_ec2( ), ], ) - def test_capacity_blocks_from_config( + def test_retrieve_capacity_blocks_from_fleet_config( self, capacity_block_manager, fleet_config, expected_capacity_blocks, expected_exception ): capacity_block_manager._fleet_config = fleet_config if expected_exception: with pytest.raises(KeyError): - capacity_block_manager._capacity_blocks_from_config() + capacity_block_manager._retrieve_capacity_blocks_from_fleet_config() else: - assert_that(expected_capacity_blocks).is_equal_to(capacity_block_manager._capacity_blocks_from_config()) + assert_that(expected_capacity_blocks).is_equal_to( + capacity_block_manager._retrieve_capacity_blocks_from_fleet_config() + ) @pytest.mark.parametrize( ("compute_resource_config", "expected_result"), From 7b9f34d78259c1dea497cbc3ade28f220c870f43 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 15:01:44 +0200 Subject: [PATCH 18/35] Avoid to raise an exception if there is an error updating a single capacity block Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 7 ++-- .../test_capacity_block_manager.py | 38 +++++++++---------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 8b0ce4921..ffb98879b 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -277,7 +277,7 @@ def _update_capacity_blocks_info_from_ec2(self): """ capacity_block_ids = self._capacity_blocks.keys() logger.info( - "Retrieving updated Capacity Block reservation information from EC2 for %s", + "Retrieving Capacity Block reservation information from EC2 for %s", ",".join(capacity_block_ids), ) capacity_block_reservations_info: 
List[ @@ -289,9 +289,8 @@ def _update_capacity_blocks_info_from_ec2(self): try: self._capacity_blocks[capacity_block_id].update_ec2_info(capacity_block_reservation_info) except KeyError: - raise CapacityBlockManagerError( - f"Unable to find capacity block {capacity_block_id} in the internal map" - ) + # should never happen + logger.error(f"Unable to find capacity block {capacity_block_id} in the internal map.") def _retrieve_capacity_blocks_from_fleet_config(self): """ diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 6d0aef556..8b08c9074 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -146,13 +146,13 @@ def test_ec2_client(self, capacity_block_manager, mocker): CapacityBlockReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} ), - # add another id not in the capacity block to trigger an exception + # add another id not in the capacity block to trigger an internal error, that does not stop the loop CapacityBlockReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-234567"} ), ], [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], - ["node1"], + ["queue-cb-st-compute-resource-cb-1"], ), ( True, @@ -192,6 +192,7 @@ def test_get_reserved_nodenames( capacity_blocks_info_from_ec2, nodes, expected_reserved_nodenames, + caplog, ): mocker.patch.object( capacity_block_manager, "_is_time_to_update_capacity_blocks_info", return_value=is_time_to_update @@ -213,8 +214,8 @@ def test_get_reserved_nodenames( # use a trick to trigger an internal exception expected_exception = len(capacity_blocks_from_config) != len(capacity_blocks_info_from_ec2) if expected_exception: - with pytest.raises(CapacityBlockManagerError): - capacity_block_manager.get_reserved_nodenames(nodes) + capacity_block_manager.get_reserved_nodenames(nodes) + assert_that(caplog.text).contains("Unable to find capacity block") assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) else: reserved_nodenames = capacity_block_manager.get_reserved_nodenames(nodes) @@ -222,17 +223,12 @@ def test_get_reserved_nodenames( assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) if is_time_to_update: - if expected_exception: - assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(previous_update_time) - update_res_mock.assert_not_called() - cleanup_mock.assert_not_called() - else: - assert_that(capacity_block_manager._capacity_blocks_update_time).is_not_equal_to(previous_update_time) - update_res_mock.assert_has_calls( - [call(capacity_block) for capacity_block in capacity_block_manager._capacity_blocks.values()], - any_order=True, - ) - cleanup_mock.assert_called_once() + assert_that(capacity_block_manager._capacity_blocks_update_time).is_not_equal_to(previous_update_time) + update_res_mock.assert_has_calls( + [call(capacity_block) for capacity_block in capacity_block_manager._capacity_blocks.values()], + any_order=True, + ) + cleanup_mock.assert_called_once() assert_that(capacity_block_manager._capacity_blocks).is_equal_to(capacity_blocks_from_config) else: @@ -413,7 +409,7 @@ def test_update_slurm_reservation( assert_that(caplog.text).contains(msg_prefix + "Nothing to do. 
No existing" + msg_suffix) @pytest.mark.parametrize( - ("init_capacity_blocks", "capacity_blocks_info_from_ec2", "expected_exception"), + ("init_capacity_blocks", "capacity_blocks_info_from_ec2", "expected_error"), [ # nothing in the config ({}, [], None), @@ -456,17 +452,19 @@ def test_update_capacity_blocks_info_from_ec2( capacity_block_manager, init_capacity_blocks, capacity_blocks_info_from_ec2, - expected_exception, + expected_error, + caplog, ): + caplog.set_level(logging.INFO) capacity_block_manager._capacity_blocks = init_capacity_blocks mocked_client = mocker.MagicMock() mocked_client.return_value.describe_capacity_reservations.return_value = capacity_blocks_info_from_ec2 capacity_block_manager._ec2_client = mocked_client - if expected_exception: - with pytest.raises(CapacityBlockManagerError, match=expected_exception): - capacity_block_manager._update_capacity_blocks_info_from_ec2() + if expected_error: + capacity_block_manager._update_capacity_blocks_info_from_ec2() + assert_that(caplog.text).contains(expected_error) # assert that only existing item has been updated assert_that( From 2c373f8492bc021196f13f6191298a2e0df32c0b Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 15:36:33 +0200 Subject: [PATCH 19/35] Add essential logic to manage internal CapacityBlockManager exceptions Now all the main logic is in a try/except block. The manager is not raising any exception, but instead keeping the previous value for the list of reserved nodenames. Added logic to manage AWSClientError when contacting boto3 and converting it to CapacityBlockManagerError. Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 79 +++++++++++-------- .../test_capacity_block_manager.py | 71 +++++++++++++---- 2 files changed, 101 insertions(+), 49 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index ffb98879b..65276fbdd 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -23,6 +23,7 @@ from common.time_utils import seconds_to_minutes from slurm_plugin.slurm_resources import SlurmNode +from aws.common import AWSClientError from aws.ec2 import CapacityBlockReservationInfo, Ec2Client logger = logging.getLogger(__name__) @@ -131,35 +132,42 @@ def ec2_client(self): def get_reserved_nodenames(self, nodes: List[SlurmNode]): """Manage nodes part of capacity block reservation. Returns list of reserved nodes.""" - # evaluate if it's the moment to update info - now = datetime.now(tz=timezone.utc) - if self._is_time_to_update_capacity_blocks_info(now): - reserved_nodenames = [] + try: + # evaluate if it's the moment to update info + now = datetime.now(tz=timezone.utc) + if self._is_time_to_update_capacity_blocks_info(now): + reserved_nodenames = [] - # update list of capacity blocks from fleet config - self._capacity_blocks = self._retrieve_capacity_blocks_from_fleet_config() + # update list of capacity blocks from fleet config + self._capacity_blocks = self._retrieve_capacity_blocks_from_fleet_config() - if self._capacity_blocks: - # update capacity blocks details from ec2 (e.g. state) - self._update_capacity_blocks_info_from_ec2() - # associate nodenames to capacity blocks, - # according to queues and compute resources from fleet configuration - self._associate_nodenames_to_capacity_blocks(nodes) + if self._capacity_blocks: + # update capacity blocks details from ec2 (e.g. 
state) + self._update_capacity_blocks_info_from_ec2() + # associate nodenames to capacity blocks, + # according to queues and compute resources from fleet configuration + self._associate_nodenames_to_capacity_blocks(nodes) - # create, update or delete slurm reservation for the nodes according to CB details. - for capacity_block in self._capacity_blocks.values(): - self._update_slurm_reservation(capacity_block) + # create, update or delete slurm reservation for the nodes according to CB details. + for capacity_block in self._capacity_blocks.values(): + self._update_slurm_reservation(capacity_block) - # If CB is in not yet active or expired add nodes to list of reserved nodes - if not capacity_block.is_active(): - reserved_nodenames.extend(capacity_block.nodenames()) + # If CB is in not yet active or expired add nodes to list of reserved nodes + if not capacity_block.is_active(): + reserved_nodenames.extend(capacity_block.nodenames()) - self._reserved_nodenames = reserved_nodenames + self._reserved_nodenames = reserved_nodenames - # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks - self._cleanup_leftover_slurm_reservations() + # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks + self._cleanup_leftover_slurm_reservations() - self._capacity_blocks_update_time = now + self._capacity_blocks_update_time = now + except Exception as e: + logger.error( + "Unable to retrieve list of reserved nodes, maintaining old list: %s. Error: %s", + self._reserved_nodenames, + e, + ) return self._reserved_nodenames @@ -280,17 +288,22 @@ def _update_capacity_blocks_info_from_ec2(self): "Retrieving Capacity Block reservation information from EC2 for %s", ",".join(capacity_block_ids), ) - capacity_block_reservations_info: List[ - CapacityBlockReservationInfo - ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) - - for capacity_block_reservation_info in capacity_block_reservations_info: - capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() - try: - self._capacity_blocks[capacity_block_id].update_ec2_info(capacity_block_reservation_info) - except KeyError: - # should never happen - logger.error(f"Unable to find capacity block {capacity_block_id} in the internal map.") + try: + capacity_block_reservations_info: List[ + CapacityBlockReservationInfo + ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) + + for capacity_block_reservation_info in capacity_block_reservations_info: + capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() + try: + self._capacity_blocks[capacity_block_id].update_ec2_info(capacity_block_reservation_info) + except KeyError: + # should never happen + logger.error(f"Unable to find Capacity Block {capacity_block_id} in the internal map.") + except AWSClientError as e: + msg = f"Unable to retrieve Capacity Blocks information from EC2. 
{e}" + logger.error(msg) + raise CapacityBlockManagerError(msg) def _retrieve_capacity_blocks_from_fleet_config(self): """ diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 8b08c9074..9abe7b3fe 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -22,6 +22,7 @@ ) from slurm_plugin.slurm_resources import DynamicNode, SlurmReservation, StaticNode +from aws.common import AWSClientError from aws.ec2 import CapacityBlockReservationInfo FAKE_CAPACITY_BLOCK_ID = "cr-a1234567" @@ -136,7 +137,7 @@ def test_ec2_client(self, capacity_block_manager, mocker): [], # empty because capacity block from config is empty ), ( - # preserve old value because there is an exception + # update values because there is only an internal error True, ["node1"], { @@ -154,6 +155,17 @@ def test_ec2_client(self, capacity_block_manager, mocker): [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], ["queue-cb-st-compute-resource-cb-1"], ), + ( + # preserve old value because there is an exception + True, + ["node1"], + { + "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), + }, + AWSClientError("describe_capacity_reservations", "Boto3Error"), + [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + ["node1"], + ), ( True, ["node1"], @@ -203,7 +215,10 @@ def test_get_reserved_nodenames( return_value=capacity_blocks_from_config, ) mocked_client = mocker.MagicMock() - mocked_client.return_value.describe_capacity_reservations.return_value = capacity_blocks_info_from_ec2 + expected_exception = isinstance(capacity_blocks_info_from_ec2, AWSClientError) + mocked_client.return_value.describe_capacity_reservations.side_effect = [ + capacity_blocks_info_from_ec2 if expected_exception else capacity_blocks_info_from_ec2 + ] capacity_block_manager._ec2_client = mocked_client update_res_mock = mocker.patch.object(capacity_block_manager, "_update_slurm_reservation") cleanup_mock = mocker.patch.object(capacity_block_manager, "_cleanup_leftover_slurm_reservations") @@ -211,18 +226,20 @@ def test_get_reserved_nodenames( previous_update_time = datetime(2020, 1, 1, 0, 0, 0) capacity_block_manager._capacity_blocks_update_time = previous_update_time - # use a trick to trigger an internal exception - expected_exception = len(capacity_blocks_from_config) != len(capacity_blocks_info_from_ec2) + reserved_nodenames = capacity_block_manager.get_reserved_nodenames(nodes) if expected_exception: - capacity_block_manager.get_reserved_nodenames(nodes) - assert_that(caplog.text).contains("Unable to find capacity block") - assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) + assert_that(caplog.text).contains("Unable to retrieve list of reserved nodes, maintaining old list") else: - reserved_nodenames = capacity_block_manager.get_reserved_nodenames(nodes) - assert_that(reserved_nodenames).is_equal_to(expected_reserved_nodenames) + expected_internal_error = len(capacity_blocks_from_config) != len(capacity_blocks_info_from_ec2) + if expected_internal_error: + assert_that(caplog.text).contains("Unable to find Capacity Block") + assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) + else: + assert_that(reserved_nodenames).is_equal_to(expected_reserved_nodenames) + 
assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) - if is_time_to_update: + if is_time_to_update and not expected_exception: assert_that(capacity_block_manager._capacity_blocks_update_time).is_not_equal_to(previous_update_time) update_res_mock.assert_has_calls( [call(capacity_block) for capacity_block in capacity_block_manager._capacity_blocks.values()], @@ -230,7 +247,6 @@ def test_get_reserved_nodenames( ) cleanup_mock.assert_called_once() assert_that(capacity_block_manager._capacity_blocks).is_equal_to(capacity_blocks_from_config) - else: assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(previous_update_time) update_res_mock.assert_not_called() @@ -413,7 +429,7 @@ def test_update_slurm_reservation( [ # nothing in the config ({}, [], None), - # exception, because trying to update a capacity block not in the list, keep previous values + # exception, keep previous values ( { "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), @@ -426,7 +442,22 @@ def test_update_slurm_reservation( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} ), ], - "Unable to find capacity block cr-234567", + AWSClientError("describe_capacity_reservations", "Boto3Error"), + ), + # internal error, because trying to update a capacity block not in the list, keep previous values + ( + { + "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + }, + [ + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} + ), + CapacityBlockReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} + ), + ], + "Unable to find Capacity Block cr-234567", ), # update old values with new values ( @@ -457,12 +488,20 @@ def test_update_capacity_blocks_info_from_ec2( ): caplog.set_level(logging.INFO) capacity_block_manager._capacity_blocks = init_capacity_blocks - + expected_exception = isinstance(expected_error, AWSClientError) mocked_client = mocker.MagicMock() - mocked_client.return_value.describe_capacity_reservations.return_value = capacity_blocks_info_from_ec2 + mocked_client.return_value.describe_capacity_reservations.side_effect = [ + expected_error if expected_exception else capacity_blocks_info_from_ec2 + ] capacity_block_manager._ec2_client = mocked_client - if expected_error: + if expected_exception: + with pytest.raises( + CapacityBlockManagerError, match="Unable to retrieve Capacity Blocks information from EC2. Boto3Error" + ): + capacity_block_manager._update_capacity_blocks_info_from_ec2() + + elif expected_error: capacity_block_manager._update_capacity_blocks_info_from_ec2() assert_that(caplog.text).contains(expected_error) From da62e0f6d2b88a4372cee85f3b9ab8cdc2e0353a Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Oct 2023 18:47:46 +0200 Subject: [PATCH 20/35] Manage internal errors when managing Capacity Blocks and related slurm reservations Defined a new SlurmCommandError to identify errors coming from scontrol commands. Defined a new SlurmCommandErrorHandler to permit to handle SlurmCommandError and log messages. Added retry (max-attempt:2, wait=1s) and SlurmCommandErrorHandler decorator to all the reservations commands. Improved is_slurm_reservation command to be able to parse stderr and stdout to retrieve reservation information. Now the main method `get_reserved_nodenames`, called by clustermgtd cannot raise any exception. 
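The retry/error-handling stack follows a common decorator pattern; a self-contained sketch is below (the scontrol call is a hypothetical stand-in for the real run_command helpers in slurm_reservation_commands.py):

    import functools
    import logging
    import subprocess

    from retrying import retry

    log = logging.getLogger(__name__)


    class SlurmCommandError(Exception):
        """Raised when a slurm (scontrol) command fails."""


    def handle_slurm_command_error(func):
        """Convert subprocess failures into SlurmCommandError, logging stderr/stdout."""

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except subprocess.CalledProcessError as e:
                msg = f"Failed to execute slurm command. Error is: {e.stderr}. Output is: {e.stdout}. {e}"
                log.error(msg)
                raise SlurmCommandError(msg)

        return wrapper


    @retry(
        stop_max_attempt_number=2,  # at most one retry
        wait_fixed=1000,  # wait 1 second between attempts
        retry_on_exception=lambda exception: isinstance(exception, SlurmCommandError),
    )
    @handle_slurm_command_error
    def show_reservation(name):
        # hypothetical command, only to show the decorator order: the inner decorator
        # normalizes the failure, the outer one retries it once before giving up.
        return subprocess.run(
            ["scontrol", "show", f"ReservationName={name}"],
            check=True,
            capture_output=True,
            text=True,
        ).stdout

Because the retry predicate matches SlurmCommandError, only failures already normalized by the inner decorator are retried; after the second failed attempt the exception propagates to the caller.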
If the error in the `update_slurm_reservation` happens when updating a single capacity block/slurm reservation, this is catched and the loop will continue, updating the others. If there is a generic error like AWSClientError (wrapped to CapacityBlockManagerError) the entire list of capacity_blocks and reserved_nodes won't be changed, logging an error. If cleanup_leftover_slurm_reservation fails this will be logged, but the process will continue. Extended unit tests to cover command retries and error catching. Signed-off-by: Enrico Usai --- .../schedulers/slurm_reservation_commands.py | 50 ++++- src/common/utils.py | 27 +++ src/slurm_plugin/capacity_block_manager.py | 156 ++++++++------ .../test_slurm_reservation_commands.py | 199 ++++++++++++++++-- .../test_capacity_block_manager.py | 12 +- 5 files changed, 349 insertions(+), 95 deletions(-) diff --git a/src/common/schedulers/slurm_reservation_commands.py b/src/common/schedulers/slurm_reservation_commands.py index cc264518f..8c9ed5d10 100644 --- a/src/common/schedulers/slurm_reservation_commands.py +++ b/src/common/schedulers/slurm_reservation_commands.py @@ -17,7 +17,14 @@ from typing import List from common.schedulers.slurm_commands import DEFAULT_SCONTROL_COMMAND_TIMEOUT, SCONTROL -from common.utils import check_command_output, run_command, validate_subprocess_argument +from common.utils import ( + SlurmCommandError, + SlurmCommandErrorHandler, + check_command_output, + run_command, + validate_subprocess_argument, +) +from retrying import retry from slurm_plugin.slurm_resources import SlurmReservation logger = logging.getLogger(__name__) @@ -61,6 +68,12 @@ def _create_or_update_reservation( run_command(cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True) # nosec B604 +@retry( + stop_max_attempt_number=2, + wait_fixed=1000, + retry_on_exception=lambda exception: isinstance(exception, SlurmCommandError), +) +@SlurmCommandErrorHandler.handle_slurm_command_error def create_slurm_reservation( name: str, nodes: str = "ALL", @@ -93,6 +106,12 @@ def create_slurm_reservation( ) +@retry( + stop_max_attempt_number=2, + wait_fixed=1000, + retry_on_exception=lambda exception: isinstance(exception, SlurmCommandError), +) +@SlurmCommandErrorHandler.handle_slurm_command_error def update_slurm_reservation( name: str, nodes: str = None, @@ -121,6 +140,12 @@ def update_slurm_reservation( ) +@retry( + stop_max_attempt_number=2, + wait_fixed=1000, + retry_on_exception=lambda exception: isinstance(exception, SlurmCommandError), +) +@SlurmCommandErrorHandler.handle_slurm_command_error def delete_slurm_reservation( name: str, command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, @@ -149,6 +174,12 @@ def _add_param(cmd, param_name, value): return cmd +@retry( + stop_max_attempt_number=2, + wait_fixed=1000, + retry_on_exception=lambda exception: isinstance(exception, SlurmCommandError), +) +@SlurmCommandErrorHandler.handle_slurm_command_error def is_slurm_reservation( name: str, command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, @@ -187,17 +218,26 @@ def is_slurm_reservation( reservation_exists = f"ReservationName={name}" in output except subprocess.CalledProcessError as e: - output = e.stdout.rstrip() - if output == f"Reservation {name} not found": + expected_output = f"Reservation {name} not found" + error = f" Error is: {e.stderr.rstrip()}." if e.stderr else "" + output = f" Output is: {e.stdout.rstrip()}." 
if e.stdout else "" + if expected_output in error or expected_output in output: logger.info(f"Slurm reservation {name} not found.") reservation_exists = False else: - logger.error("Failed when retrieving Slurm reservation info with command %s. Error: %s", cmd, output) - raise e + msg = f"Failed when retrieving Slurm reservation info with command {cmd}.{error}{output} {e}" + logger.error(msg) + raise SlurmCommandError(msg) return reservation_exists +@retry( + stop_max_attempt_number=2, + wait_fixed=1000, + retry_on_exception=lambda exception: isinstance(exception, SlurmCommandError), +) +@SlurmCommandErrorHandler.handle_slurm_command_error def get_slurm_reservations_info( command_timeout=DEFAULT_SCONTROL_COMMAND_TIMEOUT, raise_on_error: bool = True ) -> List[SlurmReservation]: diff --git a/src/common/utils.py b/src/common/utils.py index af1a059cd..2cf08e095 100644 --- a/src/common/utils.py +++ b/src/common/utils.py @@ -12,6 +12,7 @@ import collections import contextlib +import functools import itertools import json import logging @@ -45,6 +46,32 @@ class EventType(Enum): UpdateEvent = collections.namedtuple("UpdateEvent", ["action", "message", "host"]) +class SlurmCommandError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class SlurmCommandErrorHandler: + """Handle SlurmCommandError.""" + + @staticmethod + def handle_slurm_command_error(func): + """Handle slurm command errors, can be used as a decorator.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except subprocess.CalledProcessError as e: + error = f" Error is: {e.stderr.rstrip()}." if e.stderr else "" + output = f" Output is: {e.stdout.rstrip()}." if e.stdout else "" + msg = f"Failed to execute slurm command.{error}{output} {e}" + log.error(msg) + raise SlurmCommandError(msg) + + return wrapper + + def load_module(module): """ Load python module. diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 65276fbdd..efc5e9b32 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -21,6 +21,7 @@ update_slurm_reservation, ) from common.time_utils import seconds_to_minutes +from common.utils import SlurmCommandError from slurm_plugin.slurm_resources import SlurmNode from aws.common import AWSClientError @@ -55,7 +56,7 @@ class CapacityBlock: Contains info like: - queue and compute resource from the config, - state from EC2, - - name of the related slurm reservation. + - name of the slurm reservation that will be created for this CB. 
""" def __init__(self, capacity_block_id, queue_name, compute_resource_name): @@ -65,7 +66,7 @@ def __init__(self, capacity_block_id, queue_name, compute_resource_name): self._capacity_block_reservation_info = None self._nodenames = [] - def update_ec2_info(self, capacity_block_reservation_info: CapacityBlockReservationInfo): + def update_capacity_block_reservation_info(self, capacity_block_reservation_info: CapacityBlockReservationInfo): """Update info from CapacityBlockReservationInfo.""" self._capacity_block_reservation_info = capacity_block_reservation_info @@ -138,33 +139,40 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): if self._is_time_to_update_capacity_blocks_info(now): reserved_nodenames = [] - # update list of capacity blocks from fleet config - self._capacity_blocks = self._retrieve_capacity_blocks_from_fleet_config() - - if self._capacity_blocks: + # find an updated list of capacity blocks from fleet config + capacity_blocks = self._retrieve_capacity_blocks_from_fleet_config() + if capacity_blocks: # update capacity blocks details from ec2 (e.g. state) - self._update_capacity_blocks_info_from_ec2() + self._update_capacity_blocks_info_from_ec2(capacity_blocks) # associate nodenames to capacity blocks, # according to queues and compute resources from fleet configuration - self._associate_nodenames_to_capacity_blocks(nodes) + self._associate_nodenames_to_capacity_blocks(capacity_blocks, nodes) # create, update or delete slurm reservation for the nodes according to CB details. - for capacity_block in self._capacity_blocks.values(): + for capacity_block in capacity_blocks.values(): self._update_slurm_reservation(capacity_block) # If CB is in not yet active or expired add nodes to list of reserved nodes if not capacity_block.is_active(): reserved_nodenames.extend(capacity_block.nodenames()) + # Once all the steps have been successful, update object attributes + self._capacity_blocks = capacity_blocks + self._capacity_blocks_update_time = now self._reserved_nodenames = reserved_nodenames # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks self._cleanup_leftover_slurm_reservations() - self._capacity_blocks_update_time = now + except (SlurmCommandError, CapacityBlockManagerError) as e: + logger.error( + "Unable to retrieve list of reserved nodes, maintaining old list: %s. %s", + self._reserved_nodenames, + e, + ) except Exception as e: logger.error( - "Unable to retrieve list of reserved nodes, maintaining old list: %s. Error: %s", + "Unexpected error. Unable to retrieve list of reserved nodes, maintaining old list: %s. %s", self._reserved_nodenames, e, ) @@ -185,48 +193,54 @@ def _is_time_to_update_capacity_blocks_info(self, now: datetime): > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD ) - def _associate_nodenames_to_capacity_blocks(self, nodes: List[SlurmNode]): + @staticmethod + def _associate_nodenames_to_capacity_blocks(capacity_blocks: Dict[str, CapacityBlock], nodes: List[SlurmNode]): """ Update capacity_block info adding nodenames list. Check configured CBs and associate nodes to them according to queue and compute resource info. 
""" for node in nodes: - for capacity_block in self._capacity_blocks.values(): + for capacity_block in capacity_blocks.values(): if capacity_block.does_node_belong_to(node): capacity_block.add_nodename(node.name) break def _cleanup_leftover_slurm_reservations(self): """Find list of slurm reservations created by ParallelCluster but not part of the configured CBs.""" - slurm_reservations = get_slurm_reservations_info() - for slurm_reservation in slurm_reservations: - if CapacityBlock.is_capacity_block_slurm_reservation(slurm_reservation.name): - capacity_block_id = CapacityBlock.slurm_reservation_name_to_id(slurm_reservation.name) - if capacity_block_id not in self._capacity_blocks.keys(): - logger.info( - ( - "Found leftover slurm reservation %s for nodes %s. " - "Related Capacity Block %s is no longer in the cluster configuration. " - "Deleting the slurm reservation." - ), - slurm_reservation.name, - slurm_reservation.nodes, - capacity_block_id, - ) - delete_slurm_reservation(name=slurm_reservation.name) + try: + for slurm_reservation in get_slurm_reservations_info(): + if CapacityBlock.is_capacity_block_slurm_reservation(slurm_reservation.name): + capacity_block_id = CapacityBlock.slurm_reservation_name_to_id(slurm_reservation.name) + if capacity_block_id not in self._capacity_blocks.keys(): + logger.info( + ( + "Found leftover slurm reservation %s for nodes %s. " + "Related Capacity Block %s is no longer in the cluster configuration. " + "Deleting the slurm reservation." + ), + slurm_reservation.name, + slurm_reservation.nodes, + capacity_block_id, + ) + try: + delete_slurm_reservation(name=slurm_reservation.name) + except SlurmCommandError as e: + logger.error("Unable to delete slurm reservation %s. %s", slurm_reservation.name, e) + else: + logger.debug( + ( + "Slurm reservation %s is managed by ParallelCluster " + "and related to an existing Capacity Block. Skipping it." + ), + slurm_reservation.name, + ) else: logger.debug( - ( - "Slurm reservation %s is managed by ParallelCluster " - "and related to an existing Capacity Block. Skipping it." - ), - slurm_reservation.name, + "Slurm reservation %s is not managed by ParallelCluster. Skipping it.", slurm_reservation.name ) - else: - logger.debug( - "Slurm reservation %s is not managed by ParallelCluster. Skipping it.", slurm_reservation.name - ) + except SlurmCommandError as e: + logger.error("Unable to retrieve list of existing Slurm reservations. %s", e) @staticmethod def _update_slurm_reservation(capacity_block: CapacityBlock): @@ -251,39 +265,47 @@ def _log_cb_info(action_info): slurm_reservation_name = capacity_block.slurm_reservation_name() capacity_block_nodenames = ",".join(capacity_block.nodenames()) - reservation_exists = is_slurm_reservation(name=slurm_reservation_name) - # if CB is active we need to remove Slurm reservation and start nodes - if capacity_block.is_active(): - # if Slurm reservation exists, delete it. - if reservation_exists: - _log_cb_info("Deleting") - delete_slurm_reservation(name=slurm_reservation_name) - else: - _log_cb_info("Nothing to do. 
No existing") - - # if CB is expired or not active we need to (re)create Slurm reservation - # to avoid considering nodes as unhealthy - else: - # create or update Slurm reservation - if reservation_exists: - _log_cb_info("Updating existing") - update_slurm_reservation(name=slurm_reservation_name, nodes=capacity_block_nodenames) + try: + reservation_exists = is_slurm_reservation(name=slurm_reservation_name) + # if CB is active we need to remove Slurm reservation and start nodes + if capacity_block.is_active(): + # if Slurm reservation exists, delete it. + if reservation_exists: + _log_cb_info("Deleting") + delete_slurm_reservation(name=slurm_reservation_name) + else: + _log_cb_info("Nothing to do. No existing") + + # if CB is expired or not active we need to (re)create Slurm reservation + # to avoid considering nodes as unhealthy else: - _log_cb_info("Creating") - create_slurm_reservation( - name=slurm_reservation_name, - start_time=datetime.now(tz=timezone.utc), - nodes=capacity_block_nodenames, - ) - - def _update_capacity_blocks_info_from_ec2(self): + # create or update Slurm reservation + if reservation_exists: + _log_cb_info("Updating existing") + update_slurm_reservation(name=slurm_reservation_name, nodes=capacity_block_nodenames) + else: + _log_cb_info("Creating") + create_slurm_reservation( + name=slurm_reservation_name, + start_time=datetime.now(tz=timezone.utc), + nodes=capacity_block_nodenames, + ) + except SlurmCommandError as e: + logger.error( + "Unable to update slurm reservation %s for Capacity Block %s. Skipping it. %s", + slurm_reservation_name, + capacity_block.capacity_block_id, + e, + ) + + def _update_capacity_blocks_info_from_ec2(self, capacity_blocks: Dict[str, CapacityBlock]): """ - Update capacity blocks in self._capacity_blocks by adding capacity reservation info. + Update capacity blocks in given capacity_blocks by adding capacity reservation info. This method is called every time the CapacityBlockManager is re-initialized, so when it starts/is restarted or when fleet configuration changes. """ - capacity_block_ids = self._capacity_blocks.keys() + capacity_block_ids = capacity_blocks.keys() logger.info( "Retrieving Capacity Block reservation information from EC2 for %s", ",".join(capacity_block_ids), @@ -296,7 +318,9 @@ def _update_capacity_blocks_info_from_ec2(self): for capacity_block_reservation_info in capacity_block_reservations_info: capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() try: - self._capacity_blocks[capacity_block_id].update_ec2_info(capacity_block_reservation_info) + capacity_blocks[capacity_block_id].update_capacity_block_reservation_info( + capacity_block_reservation_info + ) except KeyError: # should never happen logger.error(f"Unable to find Capacity Block {capacity_block_id} in the internal map.") diff --git a/tests/common/schedulers/test_slurm_reservation_commands.py b/tests/common/schedulers/test_slurm_reservation_commands.py index 1d899d7fc..b640a7446 100644 --- a/tests/common/schedulers/test_slurm_reservation_commands.py +++ b/tests/common/schedulers/test_slurm_reservation_commands.py @@ -9,8 +9,8 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
import logging +import subprocess from datetime import datetime -from subprocess import CalledProcessError import pytest from assertpy import assert_that @@ -26,6 +26,7 @@ is_slurm_reservation, update_slurm_reservation, ) +from common.utils import SlurmCommandError from slurm_plugin.slurm_resources import SlurmReservation @@ -125,11 +126,46 @@ def test_create_or_update_reservation( @pytest.mark.parametrize( - "name, nodes, partition, user, start_time, duration, number_of_nodes, flags", + ( + "name, nodes, partition, user, start_time, duration, number_of_nodes, flags, " + "run_command_output, expected_exception" + ), [ - ("root_1", None, None, None, None, None, None, None), + ("root_1", None, None, None, None, None, None, None, None, False), ( + # 1 failure, retry and then proceed "root_2", + None, + None, + None, + None, + None, + None, + None, + [ + subprocess.CalledProcessError(1, "error-msg"), + None, + ], + False, + ), + ( + # 2 failures, exception will be raised by the command + "root_3", + None, + None, + None, + None, + None, + None, + None, + [ + subprocess.CalledProcessError(1, "error-msg"), + subprocess.CalledProcessError(1, "error-msg", output="some-output", stderr="some-error"), + ], + True, + ), + ( + "root_4", "nodes-1,nodes[2-6]", "queue1", "user1", @@ -137,6 +173,8 @@ def test_create_or_update_reservation( 180, 10, "testflag", + None, + False, ), ], ) @@ -150,8 +188,14 @@ def test_create_slurm_reservation( number_of_nodes, flags, mocker, + run_command_output, + expected_exception, ): - run_cmd_mock = mocker.patch("common.schedulers.slurm_reservation_commands._create_or_update_reservation") + mocker.patch("time.sleep") + run_cmd_mock = mocker.patch( + "common.schedulers.slurm_reservation_commands._create_or_update_reservation", + side_effect=run_command_output, + ) # Compose command to avoid passing None values kwargs = {"name": name} @@ -170,7 +214,13 @@ def test_create_slurm_reservation( if flags: kwargs.update({"flags": flags}) - create_slurm_reservation(**kwargs) + if expected_exception: + with pytest.raises( + SlurmCommandError, match="Failed to execute slurm command. Error is: some-error. 
Output is: some-output" + ): + create_slurm_reservation(**kwargs) + else: + create_slurm_reservation(**kwargs) # check expected internal call nodes = nodes if nodes else "ALL" @@ -192,11 +242,46 @@ def test_create_slurm_reservation( @pytest.mark.parametrize( - "name, nodes, partition, user, start_time, duration, number_of_nodes, flags", + ( + "name, nodes, partition, user, start_time, duration, number_of_nodes, flags," + "run_command_output, expected_exception" + ), [ - ("root_1", None, None, None, None, None, None, None), + ("root_1", None, None, None, None, None, None, None, None, False), ( + # 1 failure, retry and then proceed "root_2", + None, + None, + None, + None, + None, + None, + None, + [ + subprocess.CalledProcessError(1, "error-msg"), + None, + ], + False, + ), + ( + # 2 failures, exception will be raised by the command + "root_3", + None, + None, + None, + None, + None, + None, + None, + [ + subprocess.CalledProcessError(1, "error-msg"), + subprocess.CalledProcessError(1, "error-msg", output="some-output", stderr="some-error"), + ], + True, + ), + ( + "root_4", "nodes-1,nodes[2-6]", "queue1", "user1", @@ -204,6 +289,8 @@ def test_create_slurm_reservation( 180, 10, "testflag", + None, + False, ), ], ) @@ -217,8 +304,14 @@ def test_update_slurm_reservation( number_of_nodes, flags, mocker, + run_command_output, + expected_exception, ): - run_cmd_mock = mocker.patch("common.schedulers.slurm_reservation_commands._create_or_update_reservation") + mocker.patch("time.sleep") + run_cmd_mock = mocker.patch( + "common.schedulers.slurm_reservation_commands._create_or_update_reservation", + side_effect=run_command_output, + ) # Compose command to avoid passing None values kwargs = {"name": name} @@ -237,7 +330,13 @@ def test_update_slurm_reservation( if flags: kwargs.update({"flags": flags}) - update_slurm_reservation(**kwargs) + if expected_exception: + with pytest.raises( + SlurmCommandError, match="Failed to execute slurm command. Error is: some-error. Output is: some-output" + ): + update_slurm_reservation(**kwargs) + else: + update_slurm_reservation(**kwargs) # check expected internal call run_cmd_mock.assert_called_with( @@ -256,12 +355,42 @@ def test_update_slurm_reservation( @pytest.mark.parametrize( - "name, cmd_call_kwargs", - [("root_1", {"name": "root_1"})], + "name, cmd_call_kwargs, run_command_output, expected_exception", + [ + ("root_1", {"name": "root_1"}, None, False), + ( + "root_1", + {"name": "root_1"}, + [ + subprocess.CalledProcessError(1, "error-msg"), + None, + ], + False, + ), + ( + "root_1", + {"name": "root_1"}, + [ + subprocess.CalledProcessError(1, "error-msg"), + subprocess.CalledProcessError(1, "error-msg", output="some-output", stderr="some-error"), + ], + True, + ), + ], ) -def test_delete_reservation(name, cmd_call_kwargs, mocker): - run_cmd_mock = mocker.patch("common.schedulers.slurm_reservation_commands.run_command") - delete_slurm_reservation(name) +def test_delete_reservation(name, cmd_call_kwargs, mocker, run_command_output, expected_exception): + mocker.patch("time.sleep") + run_cmd_mock = mocker.patch( + "common.schedulers.slurm_reservation_commands.run_command", side_effect=run_command_output + ) + + if expected_exception: + with pytest.raises( + SlurmCommandError, match="Failed to execute slurm command. Error is: some-error. 
Output is: some-output" + ): + delete_slurm_reservation(name) + else: + delete_slurm_reservation(name) cmd = f"{SCONTROL} delete reservation ReservationName={name}" run_cmd_mock.assert_called_with(cmd, raise_on_error=True, timeout=DEFAULT_SCONTROL_COMMAND_TIMEOUT, shell=True) @@ -283,21 +412,55 @@ def test_add_param(cmd, param_name, value, expected_cmd): ["name", "mocked_output", "expected_output", "expected_exception", "expected_message"], [ ("root_1", ["ReservationName=root_1"], True, False, None), - ("root_1", ["ReservationName=root_2"], False, False, None), # this should not happen, scontrol should exit - ("root_1", CalledProcessError(1, "", "Reservation root_1 not found"), False, False, "root_1 not found"), - ("root_1", CalledProcessError(1, "", "Generic error"), False, True, "Failed when retrieving"), + ("root_2", ["unexpected-output"], False, False, None), # this should not happen, scontrol should exit + # if available retrieve reservation info from stdout, even if there is an exception + ( + "root_3", + [subprocess.CalledProcessError(1, "", output="Reservation root_3 not found")], + False, + False, + "root_3 not found", + ), + # if available retrieve reservation info from stderr, even if there is an exception + ( + "root_4", + [subprocess.CalledProcessError(1, "", stderr="Reservation root_4 not found")], + False, + False, + "root_4 not found", + ), + ( + # 1 retry + "root_5", + [subprocess.CalledProcessError(1, ""), "ReservationName=root_5"], + True, + False, + None, + ), + ( + # 2 retries, it's a failure + "root_6", + [ + subprocess.CalledProcessError(1, "", "Generic error"), + subprocess.CalledProcessError(1, "", "Generic error"), + ], + False, + True, + "Failed when retrieving", + ), ], ) def test_is_slurm_reservation( mocker, name, mocked_output, expected_output, expected_message, expected_exception, caplog ): + mocker.patch("time.sleep") caplog.set_level(logging.INFO) run_cmd_mock = mocker.patch( "common.schedulers.slurm_reservation_commands.check_command_output", side_effect=mocked_output ) if expected_exception: - with pytest.raises(CalledProcessError): + with pytest.raises(SlurmCommandError, match=expected_message): is_slurm_reservation(name) else: assert_that(is_slurm_reservation(name)).is_equal_to(expected_output) diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 9abe7b3fe..0fbd6a3e3 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -58,7 +58,7 @@ class TestCapacityBlock: ) def test_is_active(self, capacity_block, state, expected_output): capacity_block_reservation_info = CapacityBlockReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) - capacity_block.update_ec2_info(capacity_block_reservation_info) + capacity_block.update_capacity_block_reservation_info(capacity_block_reservation_info) assert_that(capacity_block.is_active()).is_equal_to(expected_output) @pytest.mark.parametrize( @@ -307,7 +307,7 @@ def test_associate_nodenames_to_capacity_blocks( expected_nodenames_in_capacity_block, ): capacity_block_manager._capacity_blocks = capacity_blocks - capacity_block_manager._associate_nodenames_to_capacity_blocks(nodes) + capacity_block_manager._associate_nodenames_to_capacity_blocks(capacity_blocks, nodes) for capacity_block_id in capacity_block_manager._capacity_blocks.keys(): # assert in the nodenames list there are only nodes associated to the right queue and compute resource @@ -385,7 +385,7 @@ def 
test_update_slurm_reservation( ): caplog.set_level(logging.INFO) capacity_block_reservation_info = CapacityBlockReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) - capacity_block.update_ec2_info(capacity_block_reservation_info) + capacity_block.update_capacity_block_reservation_info(capacity_block_reservation_info) capacity_block.add_nodename("node1") capacity_block.add_nodename("node2") nodenames = ",".join(capacity_block.nodenames()) @@ -499,10 +499,10 @@ def test_update_capacity_blocks_info_from_ec2( with pytest.raises( CapacityBlockManagerError, match="Unable to retrieve Capacity Blocks information from EC2. Boto3Error" ): - capacity_block_manager._update_capacity_blocks_info_from_ec2() + capacity_block_manager._update_capacity_blocks_info_from_ec2(init_capacity_blocks) elif expected_error: - capacity_block_manager._update_capacity_blocks_info_from_ec2() + capacity_block_manager._update_capacity_blocks_info_from_ec2(init_capacity_blocks) assert_that(caplog.text).contains(expected_error) # assert that only existing item has been updated @@ -515,7 +515,7 @@ def test_update_capacity_blocks_info_from_ec2( ) assert_that(capacity_block_manager._capacity_blocks.get("cr-234567")).is_none() else: - capacity_block_manager._update_capacity_blocks_info_from_ec2() + capacity_block_manager._update_capacity_blocks_info_from_ec2(init_capacity_blocks) if init_capacity_blocks: # verify that all the blocks have the updated info from ec2 assert_that( From 222f79fd832a254d7fc6a8e44e4fb76c5155c791 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Oct 2023 10:16:18 +0100 Subject: [PATCH 21/35] Rename slurm_reservation_name_to_id to capacity_block_id_from_slurm_reservation_name Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 6 ++++-- tests/slurm_plugin/test_capacity_block_manager.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index efc5e9b32..a24cf83a9 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -95,7 +95,7 @@ def does_node_belong_to(self, node): return node.queue_name == self.queue_name and node.compute_resource_name == self.compute_resource_name @staticmethod - def slurm_reservation_name_to_id(slurm_reservation_name: str): + def capacity_block_id_from_slurm_reservation_name(slurm_reservation_name: str): """Parse slurm reservation name to retrieve related capacity block id.""" return slurm_reservation_name[len(SLURM_RESERVATION_NAME_PREFIX) :] # noqa E203 @@ -211,7 +211,9 @@ def _cleanup_leftover_slurm_reservations(self): try: for slurm_reservation in get_slurm_reservations_info(): if CapacityBlock.is_capacity_block_slurm_reservation(slurm_reservation.name): - capacity_block_id = CapacityBlock.slurm_reservation_name_to_id(slurm_reservation.name) + capacity_block_id = CapacityBlock.capacity_block_id_from_slurm_reservation_name( + slurm_reservation.name + ) if capacity_block_id not in self._capacity_blocks.keys(): logger.info( ( diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 0fbd6a3e3..55ac22adb 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -85,8 +85,10 @@ def test_does_node_belong_to(self, capacity_block, node, expected_output): ("reservation_name", "expected_output"), [("test", ""), (f"{SLURM_RESERVATION_NAME_PREFIX}anything-else", 
"anything-else")], ) - def test_slurm_reservation_name_to_id(self, reservation_name, expected_output): - assert_that(CapacityBlock.slurm_reservation_name_to_id(reservation_name)).is_equal_to(expected_output) + def test_capacity_block_id_from_slurm_reservation_name(self, reservation_name, expected_output): + assert_that(CapacityBlock.capacity_block_id_from_slurm_reservation_name(reservation_name)).is_equal_to( + expected_output + ) @pytest.mark.parametrize( ("reservation_name", "expected_output"), From 0a7b1a25827aa7c373181a184af71c3a654de748 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Oct 2023 12:21:25 +0100 Subject: [PATCH 22/35] Manage empty allocation strategy when using capacity block Signed-off-by: Enrico Usai --- src/slurm_plugin/fleet_manager.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/slurm_plugin/fleet_manager.py b/src/slurm_plugin/fleet_manager.py index 31ef62eb8..787ccc89f 100644 --- a/src/slurm_plugin/fleet_manager.py +++ b/src/slurm_plugin/fleet_manager.py @@ -314,12 +314,15 @@ def _evaluate_launch_params(self, count): """Evaluate parameters to be passed to create_fleet call.""" try: common_launch_options = { - # AllocationStrategy can assume different values for SpotOptions and OnDemandOptions - "AllocationStrategy": self._compute_resource_config["AllocationStrategy"], "SingleInstanceType": self._uses_single_instance_type(), "SingleAvailabilityZone": self._uses_single_az(), # If using Multi-AZ (by specifying multiple subnets), # set SingleAvailabilityZone to False } + allocation_strategy = self._compute_resource_config.get("AllocationStrategy") + if allocation_strategy: + # AllocationStrategy can assume different values for SpotOptions and OnDemandOptions + # and is not set for Capacity Block + common_launch_options.update({"AllocationStrategy": allocation_strategy}) if self._uses_single_az() or self._uses_single_instance_type(): # If the minimum target capacity is not reached, the fleet launches no instances @@ -334,6 +337,7 @@ def _evaluate_launch_params(self, count): if self._compute_resource_config["CapacityType"] == "spot": launch_options = {"SpotOptions": common_launch_options} else: + # This is valid for both on-demand and capacity-block launch_options = { "OnDemandOptions": { **common_launch_options, From 1e93e63504dcb1e73c699f66b86fe845111be112 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Oct 2023 12:24:59 +0100 Subject: [PATCH 23/35] Add unit tests for fleet manager when using capacity block Signed-off-by: Enrico Usai --- tests/common.py | 16 ++++ tests/slurm_plugin/test_fleet_manager.py | 93 ++++++++++++++++++- .../expected_launch_params.json | 33 +++++++ 3 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_capacity_block/expected_launch_params.json diff --git a/tests/common.py b/tests/common.py index bd4969642..95fd16ccb 100644 --- a/tests/common.py +++ b/tests/common.py @@ -94,6 +94,22 @@ def client_error(error_code): "Networking": MULTIPLE_SUBNETS, }, }, + "queue-cb": { + "run-instances-capacity-block": { + "Api": "run-instances", + "Instances": [{"InstanceType": "c5.xlarge"}], + "CapacityType": "capacity-block", + "Networking": SINGLE_SUBNET, + "CapacityReservationId": "cr-123456", + }, + "fleet-capacity-block": { + "Api": "create-fleet", + "Instances": [{"InstanceType": "t2.medium"}, {"InstanceType": "t2.large"}], + "CapacityType": "capacity-block", + "Networking": SINGLE_SUBNET, + 
"CapacityReservationId": "cr-234567", + }, + }, } LAUNCH_OVERRIDES = {} diff --git a/tests/slurm_plugin/test_fleet_manager.py b/tests/slurm_plugin/test_fleet_manager.py index f0fe26d3a..1f6e96e88 100644 --- a/tests/slurm_plugin/test_fleet_manager.py +++ b/tests/slurm_plugin/test_fleet_manager.py @@ -329,6 +329,25 @@ class TestEc2CreateFleetManager: "Type": "instant", } + test_capacity_block_params = { + "LaunchTemplateConfigs": [ + { + "LaunchTemplateSpecification": { + "LaunchTemplateName": "queue-cb-fleet-capacity-block", + "Version": "$Latest", + }, + } + ], + "OnDemandOptions": { + "SingleInstanceType": False, + "SingleAvailabilityZone": True, + "MinTargetCapacity": 1, + "CapacityReservationOptions": {"UsageStrategy": "use-capacity-reservations-first"}, + }, + "TargetCapacitySpecification": {"TotalTargetCapacity": 5, "DefaultTargetCapacityType": "capacity-block"}, + "Type": "instant", + } + @pytest.mark.parametrize( ("batch_size", "queue", "compute_resource", "all_or_nothing", "launch_overrides", "log_assertions"), [ @@ -336,6 +355,8 @@ class TestEc2CreateFleetManager: (5, "queue1", "fleet-spot", False, {}, None), # normal - on-demand (5, "queue2", "fleet-ondemand", False, {}, None), + # normal - capacity-block + (5, "queue-cb", "fleet-capacity-block", False, {}, None), # all or nothing (5, "queue1", "fleet-spot", True, {}, None), # launch_overrides @@ -374,6 +395,7 @@ class TestEc2CreateFleetManager: ids=[ "fleet_spot", "fleet_ondemand", + "fleet_capacity_block", "all_or_nothing", "launch_overrides", "fleet-single-az-multi-it-all_or_nothing", @@ -586,6 +608,68 @@ def test_evaluate_launch_params( } ], ), + # normal - capacity-block + ( + test_capacity_block_params, + [ + MockedBoto3Request( + method="create_fleet", + response={ + "Instances": [{"InstanceIds": ["i-12345"]}], + "Errors": [ + {"ErrorCode": "InsufficientInstanceCapacity", "ErrorMessage": "Insufficient capacity."} + ], + "ResponseMetadata": {"RequestId": "1234-abcde"}, + }, + expected_params=test_capacity_block_params, + ), + MockedBoto3Request( + method="describe_instances", + response={ + "Reservations": [ + { + "Instances": [ + { + "InstanceId": "i-12345", + "PrivateIpAddress": "ip-2", + "PrivateDnsName": "hostname", + "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + "PrivateIpAddress": "ip-2", + }, + ], + }, + ] + } + ] + }, + expected_params={"InstanceIds": ["i-12345"]}, + generate_error=False, + ), + ], + [ + { + "InstanceId": "i-12345", + "PrivateIpAddress": "ip-2", + "PrivateDnsName": "hostname", + "LaunchTime": datetime(2020, 1, 1, tzinfo=timezone.utc), + "NetworkInterfaces": [ + { + "Attachment": { + "DeviceIndex": 0, + "NetworkCardIndex": 0, + }, + "PrivateIpAddress": "ip-2", + }, + ], + } + ], + ), # create-fleet - throttling ( test_on_demand_params, @@ -624,7 +708,14 @@ def test_evaluate_launch_params( [], ), ], - ids=["fleet_spot", "fleet_exception", "fleet_ondemand", "fleet_throttling", "fleet_multiple_errors"], + ids=[ + "fleet_spot", + "fleet_exception", + "fleet_ondemand", + "fleet_capacity_block", + "fleet_throttling", + "fleet_multiple_errors", + ], ) def test_launch_instances( self, diff --git a/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_capacity_block/expected_launch_params.json b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_capacity_block/expected_launch_params.json new file mode 100644 
index 000000000..67b68a1b7 --- /dev/null +++ b/tests/slurm_plugin/test_fleet_manager/TestEc2CreateFleetManager/test_evaluate_launch_params/fleet_capacity_block/expected_launch_params.json @@ -0,0 +1,33 @@ +{ + "LaunchTemplateConfigs":[ + { + "LaunchTemplateSpecification":{ + "LaunchTemplateName":"hit-queue-cb-fleet-capacity-block", + "Version":"$Latest" + }, + "Overrides":[ + { + "InstanceType":"t2.medium", + "SubnetId":"1234567" + }, + { + "InstanceType":"t2.large", + "SubnetId":"1234567" + } + ] + } + ], + "OnDemandOptions":{ + "SingleInstanceType":false, + "SingleAvailabilityZone":true, + "MinTargetCapacity":1, + "CapacityReservationOptions":{ + "UsageStrategy":"use-capacity-reservations-first" + } + }, + "TargetCapacitySpecification":{ + "TotalTargetCapacity":5, + "DefaultTargetCapacityType":"capacity-block" + }, + "Type":"instant" +} \ No newline at end of file From 4564d1e60e868f6a1afd86374876de21d97d1ae9 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 2 Nov 2023 11:01:34 +0100 Subject: [PATCH 24/35] Rename parameters of is_healthy and is_state_healthy methods of SlurmNodes From a SlurmNode perspective "terminate_down/drain_nodes" does not have any sense. We can instead pass a flag to the is_healthy function to say if we want to consider down/drain nodes as unhealthy. Signed-off-by: Enrico Usai --- src/slurm_plugin/clustermgtd.py | 5 +++- src/slurm_plugin/slurm_resources.py | 24 +++++++++---------- .../slurm_resources/test_slurm_resources.py | 12 ++++++---- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index 810d2fdfb..21ccaf856 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -755,7 +755,10 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes): ) for node in slurm_nodes: - if not node.is_healthy(self._config.terminate_drain_nodes, self._config.terminate_down_nodes): + if not node.is_healthy( + consider_drain_as_unhealthy=self._config.terminate_drain_nodes, + consider_down_as_unhealthy=self._config.terminate_down_nodes, + ): if not self._config.disable_capacity_blocks_management and node.name in reserved_nodenames: # do not consider as unhealthy the nodes reserved for capacity blocks continue diff --git a/src/slurm_plugin/slurm_resources.py b/src/slurm_plugin/slurm_resources.py index 5a00f550d..9fb9ed708 100644 --- a/src/slurm_plugin/slurm_resources.py +++ b/src/slurm_plugin/slurm_resources.py @@ -373,7 +373,7 @@ def is_invalid_slurm_registration(self): return self.SLURM_SCONTROL_INVALID_REGISTRATION_STATE in self.states @abstractmethod - def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): + def is_state_healthy(self, consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=True): """Check if a slurm node's scheduler state is considered healthy.""" pass @@ -394,7 +394,7 @@ def is_bootstrap_timeout(self): pass @abstractmethod - def is_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): + def is_healthy(self, consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=True): """Check if a slurm node is considered healthy.""" pass @@ -478,23 +478,23 @@ def __init__( reservation_name=reservation_name, ) - def is_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): + def is_healthy(self, consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=True): """Check if a slurm node is 
considered healthy.""" return ( self._is_static_node_ip_configuration_valid(log_warn_if_unhealthy=log_warn_if_unhealthy) and self.is_backing_instance_valid(log_warn_if_unhealthy=log_warn_if_unhealthy) and self.is_state_healthy( - terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=log_warn_if_unhealthy + consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=log_warn_if_unhealthy ) ) - def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): + def is_state_healthy(self, consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=True): """Check if a slurm node's scheduler state is considered healthy.""" # Check if node is rebooting: if so, the node is healthy if self.is_rebooting(): return True # Check to see if node is in DRAINED, ignoring any node currently being replaced or in POWER_DOWN - if self.is_drained() and not self.is_power_down() and terminate_drain_nodes: + if self.is_drained() and not self.is_power_down() and consider_drain_as_unhealthy: if self.is_being_replaced: logger.debug( "Node state check: node %s in DRAINED but is currently being replaced, ignoring, node state: %s", @@ -507,7 +507,7 @@ def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn logger.warning("Node state check: node %s in DRAINED, node state: %s", self, self.state_string) return False # Check to see if node is in DOWN, ignoring any node currently being replaced - elif self.is_down() and terminate_down_nodes: + elif self.is_down() and consider_down_as_unhealthy: if self.is_being_replaced: logger.debug( "Node state check: node %s in DOWN but is currently being replaced, ignoring. Node state: ", @@ -597,18 +597,18 @@ def __init__( reservation_name=reservation_name, ) - def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): + def is_state_healthy(self, consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=True): """Check if a slurm node's scheduler state is considered healthy.""" # Check if node is rebooting: if so, the node is healthy if self.is_rebooting(): return True # Check to see if node is in DRAINED, ignoring any node currently being replaced or in POWER_DOWN - if self.is_drained() and not self.is_power_down() and terminate_drain_nodes: + if self.is_drained() and not self.is_power_down() and consider_drain_as_unhealthy: if log_warn_if_unhealthy: logger.warning("Node state check: node %s in DRAINED, node state: %s", self, self.state_string) return False # Check to see if node is in DOWN, ignoring any node currently being replaced - elif self.is_down() and terminate_down_nodes: + elif self.is_down() and consider_down_as_unhealthy: if not self.is_nodeaddr_set(): # Silently handle failed to launch dynamic node to clean up normal logging logger.debug("Node state check: node %s in DOWN, node state: %s", self, self.state_string) @@ -618,10 +618,10 @@ def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn return False return True - def is_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True): + def is_healthy(self, consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=True): """Check if a slurm node is considered healthy.""" return self.is_backing_instance_valid(log_warn_if_unhealthy=log_warn_if_unhealthy) and self.is_state_healthy( - terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=log_warn_if_unhealthy + 
consider_drain_as_unhealthy, consider_down_as_unhealthy, log_warn_if_unhealthy=log_warn_if_unhealthy ) def is_bootstrap_failure(self): diff --git a/tests/slurm_plugin/slurm_resources/test_slurm_resources.py b/tests/slurm_plugin/slurm_resources/test_slurm_resources.py index 89b69b225..0d1291e3b 100644 --- a/tests/slurm_plugin/slurm_resources/test_slurm_resources.py +++ b/tests/slurm_plugin/slurm_resources/test_slurm_resources.py @@ -556,7 +556,7 @@ def test_partition_is_inactive(nodes, expected_output): @pytest.mark.parametrize( - "node, terminate_drain_nodes, terminate_down_nodes, mock_is_node_being_replaced, expected_result", + "node, consider_drain_as_unhealthy, consider_down_as_unhealthy, mock_is_node_being_replaced, expected_result", [ pytest.param( DynamicNode("queue-dy-c5xlarge-1", "some_ip", "hostname", "MIXED+CLOUD", "queue"), @@ -709,10 +709,12 @@ def test_partition_is_inactive(nodes, expected_output): ], ) def test_slurm_node_is_state_healthy( - node, mock_is_node_being_replaced, terminate_drain_nodes, terminate_down_nodes, expected_result + node, mock_is_node_being_replaced, consider_drain_as_unhealthy, consider_down_as_unhealthy, expected_result ): node.is_being_replaced = mock_is_node_being_replaced - assert_that(node.is_state_healthy(terminate_drain_nodes, terminate_down_nodes)).is_equal_to(expected_result) + assert_that(node.is_state_healthy(consider_drain_as_unhealthy, consider_down_as_unhealthy)).is_equal_to( + expected_result + ) @pytest.mark.parametrize( @@ -1005,7 +1007,9 @@ def test_slurm_node_is_bootstrap_failure( ) def test_slurm_node_is_healthy(node, instance, expected_result): node.instance = instance - assert_that(node.is_healthy(terminate_drain_nodes=True, terminate_down_nodes=True)).is_equal_to(expected_result) + assert_that(node.is_healthy(consider_drain_as_unhealthy=True, consider_down_as_unhealthy=True)).is_equal_to( + expected_result + ) @pytest.mark.parametrize( From 54d1b5bfa4bc7dea9eff51262dfe5c76b1499cfb Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Fri, 3 Nov 2023 16:31:21 +0100 Subject: [PATCH 25/35] Improve unit tests, covering missing error cases Signed-off-by: Enrico Usai --- .../test_capacity_block_manager.py | 73 +++++++++++++++++-- 1 file changed, 65 insertions(+), 8 deletions(-) diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 55ac22adb..67e8b575b 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -14,6 +14,7 @@ import pytest from assertpy import assert_that +from common.utils import SlurmCommandError from slurm_plugin.capacity_block_manager import ( SLURM_RESERVATION_NAME_PREFIX, CapacityBlock, @@ -158,7 +159,7 @@ def test_ec2_client(self, capacity_block_manager, mocker): ["queue-cb-st-compute-resource-cb-1"], ), ( - # preserve old value because there is an exception + # preserve old value because there is a managed exception True, ["node1"], { @@ -168,6 +169,17 @@ def test_ec2_client(self, capacity_block_manager, mocker): [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], ["node1"], ), + ( + # preserve old value because there is an unexpected exception + True, + ["node1"], + { + "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), + }, + Exception("generic-error"), + [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + ["node1"], + ), ( True, ["node1"], @@ -217,7 +229,7 @@ def 
test_get_reserved_nodenames( return_value=capacity_blocks_from_config, ) mocked_client = mocker.MagicMock() - expected_exception = isinstance(capacity_blocks_info_from_ec2, AWSClientError) + expected_exception = isinstance(capacity_blocks_info_from_ec2, Exception) mocked_client.return_value.describe_capacity_reservations.side_effect = [ capacity_blocks_info_from_ec2 if expected_exception else capacity_blocks_info_from_ec2 ] @@ -318,7 +330,13 @@ def test_associate_nodenames_to_capacity_blocks( ) @pytest.mark.parametrize( - ("slurm_reservations", "expected_leftover_slurm_reservations"), + ( + "slurm_reservations", + "delete_reservation_behaviour", + "expected_delete_calls", + "expected_leftover_slurm_reservations", + "expected_error_message", + ), [ ( [ @@ -332,9 +350,24 @@ def test_associate_nodenames_to_capacity_blocks( SlurmReservation( name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654", state="active", users="anyone", nodes="node1" ), + # leftover reservation triggering an exception + SlurmReservation( + name=f"{SLURM_RESERVATION_NAME_PREFIX}cr-876543", state="active", users="anyone", nodes="node1" + ), ], + [None, SlurmCommandError("delete error")], + [f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654", f"{SLURM_RESERVATION_NAME_PREFIX}cr-876543"], [f"{SLURM_RESERVATION_NAME_PREFIX}cr-987654"], - ) + f"Unable to delete slurm reservation {SLURM_RESERVATION_NAME_PREFIX}cr-876543. delete error", + ), + ( + # no reservation in the output because of the error retrieving res info + SlurmCommandError("list error"), + [], + [], + [], + "Unable to retrieve list of existing Slurm reservations. list error", + ), ], ) def test_cleanup_leftover_slurm_reservations( @@ -343,21 +376,32 @@ def test_cleanup_leftover_slurm_reservations( capacity_block_manager, capacity_block, slurm_reservations, + delete_reservation_behaviour, + expected_delete_calls, expected_leftover_slurm_reservations, + expected_error_message, + caplog, ): # only cr-123456, queue-cb, compute-resource-cb is in the list of capacity blocks from config capacity_block_manager._capacity_blocks = {"cr-123456": capacity_block} - mocker.patch("slurm_plugin.capacity_block_manager.get_slurm_reservations_info", return_value=slurm_reservations) - delete_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.delete_slurm_reservation") + mocker.patch( + "slurm_plugin.capacity_block_manager.get_slurm_reservations_info", side_effect=[slurm_reservations] + ) + delete_res_mock = mocker.patch( + "slurm_plugin.capacity_block_manager.delete_slurm_reservation", side_effect=delete_reservation_behaviour + ) capacity_block_manager._cleanup_leftover_slurm_reservations() # verify that only the slurm reservation associated with a CB, no longer in the config, # are considered as leftover expected_calls = [] - for slurm_reservation in expected_leftover_slurm_reservations: + for slurm_reservation in expected_delete_calls: expected_calls.append(call(name=slurm_reservation)) delete_res_mock.assert_has_calls(expected_calls) + assert_that(delete_res_mock.call_count).is_equal_to(len(delete_reservation_behaviour)) + assert_that(caplog.text).contains(expected_error_message) + @pytest.mark.parametrize( ( "state", @@ -371,6 +415,7 @@ def test_cleanup_leftover_slurm_reservations( ("pending", True, False, True, False), ("active", False, False, False, False), ("active", True, False, False, True), + ("active", SlurmCommandError("error checking res"), False, False, False), ], ) def test_update_slurm_reservation( @@ -393,7 +438,7 @@ def test_update_slurm_reservation( nodenames = 
",".join(capacity_block.nodenames()) slurm_reservation_name = f"{SLURM_RESERVATION_NAME_PREFIX}{FAKE_CAPACITY_BLOCK_ID}" check_res_mock = mocker.patch( - "slurm_plugin.capacity_block_manager.is_slurm_reservation", return_value=reservation_exists + "slurm_plugin.capacity_block_manager.is_slurm_reservation", side_effect=[reservation_exists] ) create_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.create_slurm_reservation") update_res_mock = mocker.patch("slurm_plugin.capacity_block_manager.update_slurm_reservation") @@ -414,18 +459,30 @@ def test_update_slurm_reservation( name=slurm_reservation_name, start_time=expected_start_time, nodes=nodenames ) assert_that(caplog.text).contains(msg_prefix + "Creating" + msg_suffix) + else: + create_res_mock.assert_not_called() if expected_update_res_call: update_res_mock.assert_called_with(name=slurm_reservation_name, nodes=nodenames) assert_that(caplog.text).contains(msg_prefix + "Updating existing" + msg_suffix) + else: + update_res_mock.assert_not_called() # when state is active if expected_delete_res_call: delete_res_mock.assert_called_with(name=slurm_reservation_name) assert_that(caplog.text).contains(msg_prefix + "Deleting" + msg_suffix) + else: + delete_res_mock.assert_not_called() if state == "active" and not reservation_exists: assert_that(caplog.text).contains(msg_prefix + "Nothing to do. No existing" + msg_suffix) + if isinstance(reservation_exists, SlurmCommandError): + assert_that(caplog.text).contains( + f"Unable to update slurm reservation pcluster-{FAKE_CAPACITY_BLOCK_ID} for " + f"Capacity Block {FAKE_CAPACITY_BLOCK_ID}. Skipping it. error checking res" + ) + @pytest.mark.parametrize( ("init_capacity_blocks", "capacity_blocks_info_from_ec2", "expected_error"), [ From cb93e53991bc8c49b402f103e075aade9838c38e Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 6 Nov 2023 10:35:57 +0100 Subject: [PATCH 26/35] Unify CapacityBlockReservationInfo with CapacityReservationInfo and add unit tests Signed-off-by: Enrico Usai --- src/aws/ec2.py | 85 +++++++------------ src/slurm_plugin/capacity_block_manager.py | 8 +- tests/aws/test_ec2.py | 76 +++++++++++++++++ .../test_capacity_block_manager.py | 34 ++++---- 4 files changed, 129 insertions(+), 74 deletions(-) create mode 100644 tests/aws/test_ec2.py diff --git a/src/aws/ec2.py b/src/aws/ec2.py index ce15ed97d..5a94c4f7b 100644 --- a/src/aws/ec2.py +++ b/src/aws/ec2.py @@ -18,20 +18,36 @@ class CapacityReservationInfo: Data object wrapping the result of a describe-capacity-reservations call. 
{ - "CapacityReservationId": "cr-abcdEXAMPLE9876ef ", - "EndDateType": "unlimited", - "AvailabilityZone": "eu-west-1a", - "InstanceMatchCriteria": "open", - "Tags": [], - "EphemeralStorage": false, - "CreateDate": "2019-08-07T11:34:19.000Z", - "AvailableInstanceCount": 3, + "CapacityReservationId": "cr-123456", + "OwnerId": "123", + "CapacityReservationArn": "arn:aws:ec2:us-east-2:123:capacity-reservation/cr-123456", + "AvailabilityZoneId": "use2-az1", + "InstanceType": "t3.large", "InstancePlatform": "Linux/UNIX", - "TotalInstanceCount": 3, - "State": "cancelled", + "AvailabilityZone": "eu-west-1a", "Tenancy": "default", - "EbsOptimized": true, - "InstanceType": "m5.large" + "TotalInstanceCount": 1, + "AvailableInstanceCount": 1, + "EbsOptimized": false, + "EphemeralStorage": false, + "State": "active", + "StartDate": "2023-11-15T11:30:00+00:00", + "EndDate": "2023-11-16T11:30:00+00:00", # capacity-block only + "EndDateType": "limited", + "InstanceMatchCriteria": "targeted", + "CreateDate": "2023-10-25T20:40:13+00:00", + "Tags": [ + { + "Key": "aws:ec2capacityreservation:incrementalRequestedQuantity", + "Value": "1" + }, + { + "Key": "aws:ec2capacityreservation:capacityReservationType", + "Value": "capacity-block" + } + ], + "CapacityAllocations": [], + "ReservationType": "capacity-block" # capacity-block only } """ @@ -46,41 +62,6 @@ def state(self): """Return the state of the Capacity Reservation.""" return self.capacity_reservation_data.get("State") - -class CapacityBlockReservationInfo(CapacityReservationInfo): - """ - Data object wrapping the result of a describe-capacity-reservations --capacity-type capacity-block call. - - { "CapacityReservationId": "cr-a1234567", - "EndDateType": "limited", - "ReservationType": "capacity-block", - "AvailabilityZone": "eu-east-2a", - "InstanceMatchCriteria": "targeted", - "EphemeralStorage": false, - "CreateDate": "2023-07-29T14:22:45Z ", - “StartDate": "2023-08-15T12:00:00Z", - “EndDate": "2023-08-19T12:00:00Z", - "AvailableInstanceCount": 0, - "InstancePlatform": "Linux/UNIX", - "TotalInstanceCount": 16, - “State": "payment-pending", - "Tenancy": "default", - "EbsOptimized": true, - "InstanceType": "p5.48xlarge“ - } - """ - - def __init__(self, capacity_reservation_data): - super().__init__(capacity_reservation_data) - - def start_date(self): - """Return the start date of the CB.""" - return self.capacity_reservation_data.get("StartDate") - - def end_date(self): - """Return the start date of the CB.""" - return self.capacity_reservation_data.get("EndDate") - def __eq__(self, other): return self.__dict__ == other.__dict__ @@ -92,19 +73,17 @@ def __init__(self, config=None): super().__init__("ec2", config=config) @AWSExceptionHandler.handle_client_exception - def describe_capacity_reservations( - self, capacity_reservation_ids: List[str], reservation_type=None - ) -> List[CapacityBlockReservationInfo]: - """Accept a space separated list of ids. Return a list of CapacityReservationInfo.""" + def describe_capacity_reservations(self, capacity_reservation_ids: List[str]) -> List[CapacityReservationInfo]: + """Accept a space separated list of reservation ids. 
Return a list of CapacityReservationInfo.""" result = [] response = list( self._paginate_results( self._client.describe_capacity_reservations, CapacityReservationIds=capacity_reservation_ids, - ReservationType=reservation_type, + # ReservationType=reservation_type, # not yet available ) ) for capacity_reservation in response: - result.append(CapacityBlockReservationInfo(capacity_reservation)) + result.append(CapacityReservationInfo(capacity_reservation)) return result diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index a24cf83a9..351b1fb43 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -25,7 +25,7 @@ from slurm_plugin.slurm_resources import SlurmNode from aws.common import AWSClientError -from aws.ec2 import CapacityBlockReservationInfo, Ec2Client +from aws.ec2 import CapacityReservationInfo, Ec2Client logger = logging.getLogger(__name__) @@ -66,8 +66,8 @@ def __init__(self, capacity_block_id, queue_name, compute_resource_name): self._capacity_block_reservation_info = None self._nodenames = [] - def update_capacity_block_reservation_info(self, capacity_block_reservation_info: CapacityBlockReservationInfo): - """Update info from CapacityBlockReservationInfo.""" + def update_capacity_block_reservation_info(self, capacity_block_reservation_info: CapacityReservationInfo): + """Update info from CapacityReservationInfo.""" self._capacity_block_reservation_info = capacity_block_reservation_info def slurm_reservation_name(self): @@ -314,7 +314,7 @@ def _update_capacity_blocks_info_from_ec2(self, capacity_blocks: Dict[str, Capac ) try: capacity_block_reservations_info: List[ - CapacityBlockReservationInfo + CapacityReservationInfo ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) for capacity_block_reservation_info in capacity_block_reservations_info: diff --git a/tests/aws/test_ec2.py b/tests/aws/test_ec2.py new file mode 100644 index 000000000..9773beb39 --- /dev/null +++ b/tests/aws/test_ec2.py @@ -0,0 +1,76 @@ +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import os +from collections import namedtuple + +import pytest +from assertpy import assert_that + +from aws.common import AWSClientError +from aws.ec2 import CapacityReservationInfo, Ec2Client + +MockedBoto3Request = namedtuple( + "MockedBoto3Request", ["method", "response", "expected_params", "generate_error", "error_code"] +) +# Set defaults for attributes of the namedtuple. Since fields with a default value must come after any fields without +# a default, the defaults are applied to the rightmost parameters. In this case generate_error = False and +# error_code = None +MockedBoto3Request.__new__.__defaults__ = (False, None) + + +@pytest.fixture() +def boto3_stubber_path(): + # we need to set the region in the environment because the Boto3ClientFactory requires it. 
+ os.environ["AWS_DEFAULT_REGION"] = "us-east-1" + return "aws.common.boto3" + + +FAKE_CAPACITY_BLOCK_ID = "cr-a1234567" +FAKE_CAPACITY_BLOCK_INFO = { + "CapacityReservationId": FAKE_CAPACITY_BLOCK_ID, + "EndDateType": "limited", + # "ReservationType": "capacity-block", + "AvailabilityZone": "eu-east-2a", + "InstanceMatchCriteria": "targeted", + "EphemeralStorage": False, + "CreateDate": "2023-07-29T14:22:45Z ", + "StartDate": "2023-08-15T12:00:00Z", + "EndDate": "2023-08-19T12:00:00Z", + "AvailableInstanceCount": 0, + "InstancePlatform": "Linux/UNIX", + "TotalInstanceCount": 16, + "State": "payment-pending", + "Tenancy": "default", + "EbsOptimized": True, + "InstanceType": "p5.48xlarge", +} + + +@pytest.mark.parametrize("generate_error", [True, False]) +def test_describe_capacity_reservations(boto3_stubber, generate_error): + """Verify that describe_capacity_reservations behaves as expected.""" + dummy_message = "dummy error message" + mocked_requests = [ + MockedBoto3Request( + method="describe_capacity_reservations", + expected_params={"CapacityReservationIds": [FAKE_CAPACITY_BLOCK_ID]}, + response=dummy_message if generate_error else {"CapacityReservations": [FAKE_CAPACITY_BLOCK_INFO]}, + generate_error=generate_error, + error_code=None, + ) + ] + boto3_stubber("ec2", mocked_requests) + if generate_error: + with pytest.raises(AWSClientError, match=dummy_message): + Ec2Client().describe_capacity_reservations(capacity_reservation_ids=[FAKE_CAPACITY_BLOCK_ID]) + else: + return_value = Ec2Client().describe_capacity_reservations(capacity_reservation_ids=[FAKE_CAPACITY_BLOCK_ID]) + assert_that(return_value).is_equal_to([CapacityReservationInfo(FAKE_CAPACITY_BLOCK_INFO)]) diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 67e8b575b..667fc3190 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -24,7 +24,7 @@ from slurm_plugin.slurm_resources import DynamicNode, SlurmReservation, StaticNode from aws.common import AWSClientError -from aws.ec2 import CapacityBlockReservationInfo +from aws.ec2 import CapacityReservationInfo FAKE_CAPACITY_BLOCK_ID = "cr-a1234567" FAKE_CAPACITY_BLOCK_INFO = { @@ -58,7 +58,7 @@ class TestCapacityBlock: [("active", True), ("anything-else", False)], ) def test_is_active(self, capacity_block, state, expected_output): - capacity_block_reservation_info = CapacityBlockReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) + capacity_block_reservation_info = CapacityReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) capacity_block.update_capacity_block_reservation_info(capacity_block_reservation_info) assert_that(capacity_block.is_active()).is_equal_to(expected_output) @@ -147,11 +147,11 @@ def test_ec2_client(self, capacity_block_manager, mocker): "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), }, [ - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} ), # add another id not in the capacity block to trigger an internal error, that does not stop the loop - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-234567"} ), ], @@ -189,13 +189,13 @@ def test_ec2_client(self, capacity_block_manager, mocker): "cr-345678": CapacityBlock("cr-345678", "queue-cb3", "compute-resource-cb3"), }, [ - CapacityBlockReservationInfo( + 
CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} ), - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-234567"} ), - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-345678"} ), ], @@ -431,7 +431,7 @@ def test_update_slurm_reservation( caplog, ): caplog.set_level(logging.INFO) - capacity_block_reservation_info = CapacityBlockReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) + capacity_block_reservation_info = CapacityReservationInfo({**FAKE_CAPACITY_BLOCK_INFO, "State": state}) capacity_block.update_capacity_block_reservation_info(capacity_block_reservation_info) capacity_block.add_nodename("node1") capacity_block.add_nodename("node2") @@ -494,10 +494,10 @@ def test_update_slurm_reservation( "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), }, [ - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} ), - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} ), ], @@ -509,10 +509,10 @@ def test_update_slurm_reservation( "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), }, [ - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} ), - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} ), ], @@ -525,10 +525,10 @@ def test_update_slurm_reservation( "cr-234567": CapacityBlock("cr-234567", "queue-cb", "compute-resource-cb"), }, [ - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} ), - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} ), ], @@ -568,7 +568,7 @@ def test_update_capacity_blocks_info_from_ec2( assert_that( capacity_block_manager._capacity_blocks.get("cr-123456")._capacity_block_reservation_info ).is_equal_to( - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} ) ) @@ -580,14 +580,14 @@ def test_update_capacity_blocks_info_from_ec2( assert_that( capacity_block_manager._capacity_blocks.get("cr-123456")._capacity_block_reservation_info ).is_equal_to( - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-123456"} ) ) assert_that( capacity_block_manager._capacity_blocks.get("cr-234567")._capacity_block_reservation_info ).is_equal_to( - CapacityBlockReservationInfo( + CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-234567"} ) ) From 7c84360ef5d79a3ea77940295af5105d291d1ddd Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 6 Nov 2023 15:40:05 +0100 Subject: [PATCH 27/35] Add region to Boto3 client initialization Region is required by describe-capacity-reservations call. 
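A minimal usage sketch of the resulting client construction (the region value and reservation id below are illustrative; the names used are the ones this patch introduces or keeps):

    from aws.ec2 import Ec2Client

    # Passing region explicitly takes precedence over the environment-derived
    # default (region = region if region else get_region()), so the
    # describe-capacity-reservations call is issued against the intended region.
    ec2_client = Ec2Client(region="us-east-2")
    for reservation in ec2_client.describe_capacity_reservations(["cr-a1234567"]):
        print(reservation.capacity_reservation_id(), reservation.state())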
Signed-off-by: Enrico Usai --- src/aws/common.py | 5 +++-- src/aws/ec2.py | 4 ++-- src/slurm_plugin/capacity_block_manager.py | 2 +- tests/slurm_plugin/test_capacity_block_manager.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/aws/common.py b/src/aws/common.py index 856eccf4c..09b004e95 100644 --- a/src/aws/common.py +++ b/src/aws/common.py @@ -122,8 +122,9 @@ def _log_boto3_calls(params, **kwargs): class Boto3Client: """Boto3 client Class.""" - def __init__(self, client_name: str, config: Config): - self._client = boto3.client(client_name, config=config if config else None) + def __init__(self, client_name: str, config: Config, region: str = None): + region = region if region else get_region() + self._client = boto3.client(client_name, region_name=region, config=config if config else None) self._client.meta.events.register("provide-client-params.*.*", _log_boto3_calls) def _paginate_results(self, method, **kwargs): diff --git a/src/aws/ec2.py b/src/aws/ec2.py index 5a94c4f7b..006398b20 100644 --- a/src/aws/ec2.py +++ b/src/aws/ec2.py @@ -69,8 +69,8 @@ def __eq__(self, other): class Ec2Client(Boto3Client): """Implement EC2 Boto3 client.""" - def __init__(self, config=None): - super().__init__("ec2", config=config) + def __init__(self, config=None, region=None): + super().__init__("ec2", region=region, config=config) @AWSExceptionHandler.handle_client_exception def describe_capacity_reservations(self, capacity_reservation_ids: List[str]) -> List[CapacityReservationInfo]: diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 351b1fb43..83c45ab7c 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -128,7 +128,7 @@ def __init__(self, region, fleet_config, boto3_config): @property def ec2_client(self): if not self._ec2_client: - self._ec2_client = Ec2Client(config=self._boto3_config) + self._ec2_client = Ec2Client(config=self._boto3_config, region=self._region) return self._ec2_client def get_reserved_nodenames(self, nodes: List[SlurmNode]): diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 667fc3190..76e9ac64f 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -107,7 +107,7 @@ def capacity_block_manager(self): def test_ec2_client(self, capacity_block_manager, mocker): ec2_mock = mocker.patch("slurm_plugin.capacity_block_manager.Ec2Client", return_value=mocker.MagicMock()) capacity_block_manager.ec2_client() - ec2_mock.assert_called_with(config="fake_boto3_config") + ec2_mock.assert_called_with(config="fake_boto3_config", region="eu-west-2") capacity_block_manager.ec2_client() ec2_mock.assert_called_once() From c10d89940ca5b46971cf15ced6c5678e5eb0e980 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 6 Nov 2023 15:41:52 +0100 Subject: [PATCH 28/35] Minor logging improvements Do not log errors when checking for Slurm reservation existence Do not log slurm commands Signed-off-by: Enrico Usai --- .../schedulers/slurm_reservation_commands.py | 10 +++++----- src/slurm_plugin/capacity_block_manager.py | 7 ++----- src/slurm_plugin/clustermgtd.py | 16 ++++++++-------- src/slurm_plugin/slurm_resources.py | 8 ++++---- .../test_slurm_reservation_commands.py | 6 ++++-- 5 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/common/schedulers/slurm_reservation_commands.py 
b/src/common/schedulers/slurm_reservation_commands.py index 8c9ed5d10..01fdb964c 100644 --- a/src/common/schedulers/slurm_reservation_commands.py +++ b/src/common/schedulers/slurm_reservation_commands.py @@ -100,7 +100,7 @@ def create_slurm_reservation( """ cmd = f"{SCONTROL} create reservation" - logger.info("Creating Slurm reservation with command: %s", cmd) + logger.debug("Creating Slurm reservation with command: %s", cmd) _create_or_update_reservation( cmd, name, nodes, partition, user, start_time, duration, number_of_nodes, flags, command_timeout, raise_on_error ) @@ -134,7 +134,7 @@ def update_slurm_reservation( """ cmd = f"{SCONTROL} update" - logger.info("Updating Slurm reservation with command: %s", cmd) + logger.debug("Updating Slurm reservation with command: %s", cmd) _create_or_update_reservation( cmd, name, nodes, partition, user, start_time, duration, number_of_nodes, flags, command_timeout, raise_on_error ) @@ -162,7 +162,7 @@ def delete_slurm_reservation( cmd = f"{SCONTROL} delete reservation" cmd = _add_param(cmd, "ReservationName", name) - logger.info("Deleting Slurm reservation with command: %s", cmd) + logger.debug("Deleting Slurm reservation with command: %s", cmd) run_command(cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True) # nosec B604 @@ -213,7 +213,7 @@ def is_slurm_reservation( try: logger.debug("Retrieving Slurm reservation with command: %s", cmd) output = check_command_output( - cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True + cmd, raise_on_error=raise_on_error, timeout=command_timeout, shell=True, log_error=False ) # nosec B604 reservation_exists = f"ReservationName={name}" in output @@ -222,7 +222,7 @@ def is_slurm_reservation( error = f" Error is: {e.stderr.rstrip()}." if e.stderr else "" output = f" Output is: {e.stdout.rstrip()}." if e.stdout else "" if expected_output in error or expected_output in output: - logger.info(f"Slurm reservation {name} not found.") + logger.debug(f"Slurm reservation {name} not found.") reservation_exists = False else: msg = f"Failed when retrieving Slurm reservation info with command {cmd}.{error}{output} {e}" diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 83c45ab7c..4b0030e2c 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -308,10 +308,7 @@ def _update_capacity_blocks_info_from_ec2(self, capacity_blocks: Dict[str, Capac so when it starts/is restarted or when fleet configuration changes. 
""" capacity_block_ids = capacity_blocks.keys() - logger.info( - "Retrieving Capacity Block reservation information from EC2 for %s", - ",".join(capacity_block_ids), - ) + logger.info("Retrieving Capacity Blocks information from EC2 for %s", ",".join(capacity_block_ids)) try: capacity_block_reservations_info: List[ CapacityReservationInfo @@ -355,7 +352,7 @@ def _retrieve_capacity_blocks_from_fleet_config(self): } """ capacity_blocks: Dict[str, CapacityBlock] = {} - logger.info("Retrieving Capacity Block reservation information for fleet config.") + logger.info("Retrieving Capacity Blocks from fleet configuration.") for queue_name, queue_config in self._fleet_config.items(): for compute_resource_name, compute_resource_config in queue_config.items(): diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index 21ccaf856..95604013e 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -741,18 +741,18 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes): ice_compute_resources_and_nodes_map = {} all_unhealthy_nodes = [] - # Remove the nodes part of unactive Capacity Blocks from the list of unhealthy nodes. + # Remove the nodes part of inactive Capacity Blocks from the list of unhealthy nodes. # Nodes from active Capacity Blocks will be instead managed as unhealthy instances. reserved_nodenames = [] if not self._config.disable_capacity_blocks_management: reserved_nodenames = self._capacity_block_manager.get_reserved_nodenames(slurm_nodes) - log.info( - ( - "The nodes %s are associated to unactive Capacity Blocks, " - "they will not be considered as unhealthy nodes." - ), - ",".join(reserved_nodenames), - ) + if reserved_nodenames: + log.info( + "The nodes associated with inactive Capacity Blocks and not considered as unhealthy nodes are: %s", + ",".join(reserved_nodenames), + ) + else: + log.debug("No nodes found associated with inactive Capacity Blocks.") for node in slurm_nodes: if not node.is_healthy( diff --git a/src/slurm_plugin/slurm_resources.py b/src/slurm_plugin/slurm_resources.py index 9fb9ed708..05ab7a41c 100644 --- a/src/slurm_plugin/slurm_resources.py +++ b/src/slurm_plugin/slurm_resources.py @@ -526,7 +526,7 @@ def _is_static_node_ip_configuration_valid(self, log_warn_if_unhealthy=True): if not self.is_nodeaddr_set(): if log_warn_if_unhealthy: logger.warning( - "Node state check: static node without nodeaddr set, node %s, node state %s:", + "Node state check: static node without nodeaddr set, node: %s, node state: %s", self, self.state_string, ) @@ -538,7 +538,7 @@ def is_bootstrap_failure(self): if self.is_static_nodes_in_replacement and not self.is_backing_instance_valid(log_warn_if_unhealthy=False): # Node is currently in replacement and no backing instance logger.warning( - "Node bootstrap error: Node %s is currently in replacement and no backing instance, node state %s:", + "Node bootstrap error: Node %s is currently in replacement and no backing instance, node state: %s", self, self.state_string, ) @@ -546,14 +546,14 @@ def is_bootstrap_failure(self): # Replacement timeout expires for node in replacement elif self.is_bootstrap_timeout(): logger.warning( - "Node bootstrap error: Replacement timeout expires for node %s in replacement, node state %s:", + "Node bootstrap error: Replacement timeout expires for node %s in replacement, node state: %s", self, self.state_string, ) return True elif self.is_failing_health_check and self.is_static_nodes_in_replacement: logger.warning( - "Node bootstrap error: Node %s failed during 
bootstrap when performing health check, node state %s:", + "Node bootstrap error: Node %s failed during bootstrap when performing health check, node state: %s", self, self.state_string, ) diff --git a/tests/common/schedulers/test_slurm_reservation_commands.py b/tests/common/schedulers/test_slurm_reservation_commands.py index b640a7446..82d0c670e 100644 --- a/tests/common/schedulers/test_slurm_reservation_commands.py +++ b/tests/common/schedulers/test_slurm_reservation_commands.py @@ -454,7 +454,7 @@ def test_is_slurm_reservation( mocker, name, mocked_output, expected_output, expected_message, expected_exception, caplog ): mocker.patch("time.sleep") - caplog.set_level(logging.INFO) + caplog.set_level(logging.DEBUG) run_cmd_mock = mocker.patch( "common.schedulers.slurm_reservation_commands.check_command_output", side_effect=mocked_output ) @@ -468,7 +468,9 @@ def test_is_slurm_reservation( assert_that(caplog.text).contains(expected_message) cmd = f"{SCONTROL} show ReservationName={name}" - run_cmd_mock.assert_called_with(cmd, raise_on_error=True, timeout=DEFAULT_SCONTROL_COMMAND_TIMEOUT, shell=True) + run_cmd_mock.assert_called_with( + cmd, raise_on_error=True, timeout=DEFAULT_SCONTROL_COMMAND_TIMEOUT, shell=True, log_error=False + ) def test_get_slurm_reservations_info(mocker): From 5730e14587f0dcd104a666d9ad20a67c53095592 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 7 Nov 2023 10:54:19 +0100 Subject: [PATCH 29/35] Fix ec2_client usage It is an object, so it is not callable. Fixing this I found that we were wrongly calling describe_capacity_reservations by passing a dict rather than a list of ids. Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 4 ++-- tests/slurm_plugin/test_capacity_block_manager.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 4b0030e2c..bbd66a715 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -307,12 +307,12 @@ def _update_capacity_blocks_info_from_ec2(self, capacity_blocks: Dict[str, Capac This method is called every time the CapacityBlockManager is re-initialized, so when it starts/is restarted or when fleet configuration changes. 
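# Aside (illustrative, not part of this patch): the list() conversion introduced just below
# matters because dict.keys() returns a view object rather than a list; ",".join() accepts
# the view, but, as the commit message notes, the describe_capacity_reservations wrapper
# expects a plain list of ids. A minimal sketch:
capacity_blocks = {"cr-123456": "pending", "cr-234567": "active"}

ids_view = capacity_blocks.keys()        # dict_keys(['cr-123456', 'cr-234567']), not a list
ids_list = list(capacity_blocks.keys())  # ['cr-123456', 'cr-234567']

print(",".join(ids_view))                                      # joining works on the view too
print(isinstance(ids_view, list), isinstance(ids_list, list))  # False True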
""" - capacity_block_ids = capacity_blocks.keys() + capacity_block_ids = list(capacity_blocks.keys()) logger.info("Retrieving Capacity Blocks information from EC2 for %s", ",".join(capacity_block_ids)) try: capacity_block_reservations_info: List[ CapacityReservationInfo - ] = self.ec2_client().describe_capacity_reservations(capacity_block_ids) + ] = self.ec2_client.describe_capacity_reservations(capacity_block_ids) for capacity_block_reservation_info in capacity_block_reservations_info: capacity_block_id = capacity_block_reservation_info.capacity_reservation_id() diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 76e9ac64f..38ff2ece1 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -230,7 +230,7 @@ def test_get_reserved_nodenames( ) mocked_client = mocker.MagicMock() expected_exception = isinstance(capacity_blocks_info_from_ec2, Exception) - mocked_client.return_value.describe_capacity_reservations.side_effect = [ + mocked_client.describe_capacity_reservations.side_effect = [ capacity_blocks_info_from_ec2 if expected_exception else capacity_blocks_info_from_ec2 ] capacity_block_manager._ec2_client = mocked_client @@ -549,7 +549,7 @@ def test_update_capacity_blocks_info_from_ec2( capacity_block_manager._capacity_blocks = init_capacity_blocks expected_exception = isinstance(expected_error, AWSClientError) mocked_client = mocker.MagicMock() - mocked_client.return_value.describe_capacity_reservations.side_effect = [ + mocked_client.describe_capacity_reservations.side_effect = [ expected_error if expected_exception else capacity_blocks_info_from_ec2 ] capacity_block_manager._ec2_client = mocked_client From 4ad77933cae1b55bbd4db0983029c5519a6ad762 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 7 Nov 2023 11:48:20 +0100 Subject: [PATCH 30/35] Fix slurm reservation creation Start time is required and when adding a start time the duration is required as well. Signed-off-by: Enrico Usai --- .../schedulers/slurm_reservation_commands.py | 16 +++++++++------- src/slurm_plugin/capacity_block_manager.py | 4 +++- .../slurm_plugin/test_capacity_block_manager.py | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/common/schedulers/slurm_reservation_commands.py b/src/common/schedulers/slurm_reservation_commands.py index 01fdb964c..edb5d81cb 100644 --- a/src/common/schedulers/slurm_reservation_commands.py +++ b/src/common/schedulers/slurm_reservation_commands.py @@ -14,7 +14,7 @@ # In this file the input of the module subprocess is trusted. 
import subprocess # nosec B404 from datetime import datetime -from typing import List +from typing import List, Union from common.schedulers.slurm_commands import DEFAULT_SCONTROL_COMMAND_TIMEOUT, SCONTROL from common.utils import ( @@ -42,8 +42,8 @@ def _create_or_update_reservation( nodes: str = None, partition: str = None, user: str = None, - start_time: datetime = None, - duration: int = None, + start_time: Union[datetime, str] = None, # 'now' is accepted as a value + duration: Union[int, str] = None, # 'infinite' is accepted as a value number_of_nodes: int = None, flags: str = None, command_timeout=DEFAULT_SCONTROL_COMMAND_TIMEOUT, @@ -61,6 +61,8 @@ def _create_or_update_reservation( if isinstance(start_time, datetime): # Convert start time to format accepted by slurm command cmd = _add_param(cmd, "starttime", start_time.strftime("%Y-%m-%dT%H:%M:%S")) + elif start_time: + cmd = _add_param(cmd, "starttime", start_time) cmd = _add_param(cmd, "duration", duration) cmd = _add_param(cmd, "nodecnt", number_of_nodes) cmd = _add_param(cmd, "flags", flags) @@ -79,8 +81,8 @@ def create_slurm_reservation( nodes: str = "ALL", partition: str = None, user: str = "slurm", - start_time: datetime = None, - duration: int = None, + start_time: Union[datetime, str] = None, # 'now' is accepted as a value + duration: Union[int, str] = None, # 'infinite' is accepted as a value number_of_nodes: int = None, flags: str = "maint", command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, @@ -117,8 +119,8 @@ def update_slurm_reservation( nodes: str = None, partition: str = None, user: str = None, - start_time: datetime = None, - duration: int = None, + start_time: Union[datetime, str] = None, # 'now' is accepted as a value + duration: Union[int, str] = None, # 'infinite' is accepted as a value number_of_nodes: int = None, flags: str = None, command_timeout: int = DEFAULT_SCONTROL_COMMAND_TIMEOUT, diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index bbd66a715..5ac8018ba 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -287,10 +287,12 @@ def _log_cb_info(action_info): update_slurm_reservation(name=slurm_reservation_name, nodes=capacity_block_nodenames) else: _log_cb_info("Creating") + # The reservation should start now, and will be removed when the capacity block will become active create_slurm_reservation( name=slurm_reservation_name, - start_time=datetime.now(tz=timezone.utc), + start_time="now", nodes=capacity_block_nodenames, + duration="infinite", ) except SlurmCommandError as e: logger.error( diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 38ff2ece1..b32d47c9e 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -456,7 +456,7 @@ def test_update_slurm_reservation( # when state is != active if expected_create_res_call: create_res_mock.assert_called_with( - name=slurm_reservation_name, start_time=expected_start_time, nodes=nodenames + name=slurm_reservation_name, start_time="now", nodes=nodenames, duration="infinite" ) assert_that(caplog.text).contains(msg_prefix + "Creating" + msg_suffix) else: From b40ff34f94a3935f1cf3b3f911b72475e1563abe Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 7 Nov 2023 13:31:24 +0100 Subject: [PATCH 31/35] Mark nodes as reserved only if Slurm reservation is correctly created Previously the node was considered as reserved and 
removed from unhealthy nodes, even if the slurm reservation process was failing. Now we're checking if the slurm reservation has been correctly created/updated. If the slurm reservation actions for ALL the CBs are failing, we're not marking the nodes as reserved and not updating the internal list of capacity blocks and update time. If one of the reservation is updated correctly the list of reserved nodes and update time attributes are updated. Extended unit tests accordingly. Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 27 +++-- .../test_capacity_block_manager.py | 104 +++++++++++++----- 2 files changed, 94 insertions(+), 37 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 5ac8018ba..da3942f5a 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -138,6 +138,7 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): now = datetime.now(tz=timezone.utc) if self._is_time_to_update_capacity_blocks_info(now): reserved_nodenames = [] + slurm_reservation_update_errors = 0 # find an updated list of capacity blocks from fleet config capacity_blocks = self._retrieve_capacity_blocks_from_fleet_config() @@ -150,16 +151,21 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): # create, update or delete slurm reservation for the nodes according to CB details. for capacity_block in capacity_blocks.values(): - self._update_slurm_reservation(capacity_block) + slurm_reservation_updated = self._update_slurm_reservation(capacity_block) + if not slurm_reservation_updated: + slurm_reservation_update_errors += 1 - # If CB is in not yet active or expired add nodes to list of reserved nodes - if not capacity_block.is_active(): + # If CB is in not yet active or expired add nodes to list of reserved nodes, + # only if slurm reservation has been correctly created/updated + if slurm_reservation_updated and not capacity_block.is_active(): reserved_nodenames.extend(capacity_block.nodenames()) - # Once all the steps have been successful, update object attributes - self._capacity_blocks = capacity_blocks - self._capacity_blocks_update_time = now - self._reserved_nodenames = reserved_nodenames + # If all Slurm reservation actions failed do not update object attributes + if slurm_reservation_update_errors != len(capacity_blocks) or slurm_reservation_update_errors == 0: + # Once all the steps have been successful, update object attributes + self._capacity_blocks = capacity_blocks + self._capacity_blocks_update_time = now + self._reserved_nodenames = reserved_nodenames # delete slurm reservations created by CapacityBlockManager not associated to existing capacity blocks self._cleanup_leftover_slurm_reservations() @@ -251,6 +257,8 @@ def _update_slurm_reservation(capacity_block: CapacityBlock): A CB has five possible states: payment-pending, pending, active, expired and payment-failed, we need to create/delete Slurm reservation accordingly. + + Returns True if slurm action is completed correctly, False otherwise. """ def _log_cb_info(action_info): @@ -294,6 +302,8 @@ def _log_cb_info(action_info): nodes=capacity_block_nodenames, duration="infinite", ) + + action_completed = True except SlurmCommandError as e: logger.error( "Unable to update slurm reservation %s for Capacity Block %s. Skipping it. 
%s", @@ -301,6 +311,9 @@ def _log_cb_info(action_info): capacity_block.capacity_block_id, e, ) + action_completed = False + + return action_completed def _update_capacity_blocks_info_from_ec2(self, capacity_blocks: Dict[str, CapacityBlock]): """ diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index b32d47c9e..c4e10ee35 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -118,6 +118,7 @@ def test_ec2_client(self, capacity_block_manager, mocker): "capacity_blocks_from_config", "capacity_blocks_info_from_ec2", "nodes", + "slurm_reservation_creation_succeeded", "expected_reserved_nodenames", ), [ @@ -128,6 +129,7 @@ def test_ec2_client(self, capacity_block_manager, mocker): {}, [], [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [True], ["node1"], ), ( @@ -137,50 +139,81 @@ def test_ec2_client(self, capacity_block_manager, mocker): {}, [], [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [True], [], # empty because capacity block from config is empty ), ( - # update values because there is only an internal error + # return first nodename because there is an internal error + # in the second capacity block when updating info from EC2 True, ["node1"], - { - "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), - }, + {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, [ CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} ), - # add another id not in the capacity block to trigger an internal error, that does not stop the loop + # add another id not in the capacity block to trigger an internal error + # for _update_capacity_blocks_info_from_ec2, that does not stop the loop CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-234567"} ), ], [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [True, True], ["queue-cb-st-compute-resource-cb-1"], ), ( - # preserve old value because there is a managed exception + # return first nodename because there is an internal error + # in the second capacity block when creating Slurm reservation True, ["node1"], - { - "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), - }, + {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, + [ + CapacityReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} + ) + ], + [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [True, False], # reservation creation failed for the first CB + ["queue-cb-st-compute-resource-cb-1"], + ), + ( + # preserve original list of nodenames because there are internal errors + # while creating all the slurm reservations + True, + ["node1"], + {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, + [ + CapacityReservationInfo( + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} + ), + ], + [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [False, False], # reservation creation failed for all the CBs + ["node1"], + ), + ( + # preserve old nodenames because there is a managed exception when contacting EC2 + 
True, + ["node1"], + {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, AWSClientError("describe_capacity_reservations", "Boto3Error"), [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [False], ["node1"], ), ( - # preserve old value because there is an unexpected exception + # preserve old nodenames because there is an unexpected exception when contacting EC2 True, ["node1"], - { - "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), - }, + {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, Exception("generic-error"), [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], + [False], ["node1"], ), ( + # return nodenames accordingly to the state (only pending) True, ["node1"], { @@ -196,7 +229,7 @@ def test_ec2_client(self, capacity_block_manager, mocker): {**FAKE_CAPACITY_BLOCK_INFO, "State": "active", "CapacityReservationId": "cr-234567"} ), CapacityReservationInfo( - {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-345678"} + {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending-payment", "CapacityReservationId": "cr-345678"} ), ], [ @@ -204,6 +237,7 @@ def test_ec2_client(self, capacity_block_manager, mocker): DynamicNode("queue-cb2-dy-compute-resource-cb2-1", "ip-1", "hostname-1", "some_state", "queue-cb2"), DynamicNode("queue-cb3-dy-compute-resource-cb3-1", "ip-1", "hostname-1", "some_state", "queue-cb3"), ], + [True, True, True], ["queue-cb-st-compute-resource-cb-1", "queue-cb3-dy-compute-resource-cb3-1"], ), ], @@ -217,6 +251,7 @@ def test_get_reserved_nodenames( capacity_blocks_from_config, capacity_blocks_info_from_ec2, nodes, + slurm_reservation_creation_succeeded, expected_reserved_nodenames, caplog, ): @@ -229,19 +264,21 @@ def test_get_reserved_nodenames( return_value=capacity_blocks_from_config, ) mocked_client = mocker.MagicMock() - expected_exception = isinstance(capacity_blocks_info_from_ec2, Exception) - mocked_client.describe_capacity_reservations.side_effect = [ - capacity_blocks_info_from_ec2 if expected_exception else capacity_blocks_info_from_ec2 - ] + expected_ec2_exception = isinstance(capacity_blocks_info_from_ec2, Exception) + mocked_client.describe_capacity_reservations.side_effect = [capacity_blocks_info_from_ec2] capacity_block_manager._ec2_client = mocked_client - update_res_mock = mocker.patch.object(capacity_block_manager, "_update_slurm_reservation") + update_res_mock = mocker.patch.object( + capacity_block_manager, "_update_slurm_reservation", side_effect=slurm_reservation_creation_succeeded + ) cleanup_mock = mocker.patch.object(capacity_block_manager, "_cleanup_leftover_slurm_reservations") capacity_block_manager._reserved_nodenames = previous_reserved_nodenames previous_update_time = datetime(2020, 1, 1, 0, 0, 0) capacity_block_manager._capacity_blocks_update_time = previous_update_time reserved_nodenames = capacity_block_manager.get_reserved_nodenames(nodes) - if expected_exception: + assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) + + if expected_ec2_exception: assert_that(caplog.text).contains("Unable to retrieve list of reserved nodes, maintaining old list") else: expected_internal_error = len(capacity_blocks_from_config) != len(capacity_blocks_info_from_ec2) @@ -251,16 +288,20 @@ def test_get_reserved_nodenames( else: assert_that(reserved_nodenames).is_equal_to(expected_reserved_nodenames) - 
assert_that(capacity_block_manager._reserved_nodenames).is_equal_to(expected_reserved_nodenames) + if is_time_to_update and not expected_ec2_exception: + if all(not action_succeeded for action_succeeded in slurm_reservation_creation_succeeded): + # if all slurm reservations actions failed, do not update the values + assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(previous_update_time) + assert_that(capacity_block_manager._capacity_blocks).is_not_equal_to(capacity_blocks_from_config) + else: + assert_that(capacity_block_manager._capacity_blocks_update_time).is_not_equal_to(previous_update_time) + assert_that(capacity_block_manager._capacity_blocks).is_equal_to(capacity_blocks_from_config) - if is_time_to_update and not expected_exception: - assert_that(capacity_block_manager._capacity_blocks_update_time).is_not_equal_to(previous_update_time) update_res_mock.assert_has_calls( [call(capacity_block) for capacity_block in capacity_block_manager._capacity_blocks.values()], any_order=True, ) cleanup_mock.assert_called_once() - assert_that(capacity_block_manager._capacity_blocks).is_equal_to(capacity_blocks_from_config) else: assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(previous_update_time) update_res_mock.assert_not_called() @@ -409,13 +450,14 @@ def test_cleanup_leftover_slurm_reservations( "expected_create_res_call", "expected_update_res_call", "expected_delete_res_call", + "expected_output", ), [ - ("pending", False, True, False, False), - ("pending", True, False, True, False), - ("active", False, False, False, False), - ("active", True, False, False, True), - ("active", SlurmCommandError("error checking res"), False, False, False), + ("pending", False, True, False, False, True), + ("pending", True, False, True, False, True), + ("active", False, False, False, False, True), + ("active", True, False, False, True, True), + ("active", SlurmCommandError("error checking res"), False, False, False, False), ], ) def test_update_slurm_reservation( @@ -428,6 +470,7 @@ def test_update_slurm_reservation( expected_create_res_call, expected_update_res_call, expected_delete_res_call, + expected_output, caplog, ): caplog.set_level(logging.INFO) @@ -446,7 +489,8 @@ def test_update_slurm_reservation( expected_start_time = datetime(2020, 1, 1, 0, 0, 0) mocker.patch("slurm_plugin.capacity_block_manager.datetime").now.return_value = expected_start_time - capacity_block_manager._update_slurm_reservation(capacity_block) + output_value = capacity_block_manager._update_slurm_reservation(capacity_block) + assert_that(expected_output).is_equal_to(output_value) # check the right commands to create/delete/update reservations are called accordingly to the state check_res_mock.assert_called_with(name=slurm_reservation_name) From 303dbf6728203eb4bfaa21e7a9788d48cb97d18a Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 7 Nov 2023 18:05:17 +0100 Subject: [PATCH 32/35] Avoid to call update_slurm_reservation when it is not required Previously we were calling scontrol command to update the reservation every 10 minutes, but this is useless. The only case where we need to update the reservation is to update the list of nodes, this can happen only at cluster update time, so when Clustermgtd is restarted. To identify this moment we are using the `_capacity_blocks_update_time` attribute. 
Defined a new `_is_initialized` method and using it for `_update_slurm_reservation` Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 26 ++++++++++----- .../test_capacity_block_manager.py | 33 +++++++++++++++---- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index da3942f5a..7a1464d90 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -131,6 +131,10 @@ def ec2_client(self): self._ec2_client = Ec2Client(config=self._boto3_config, region=self._region) return self._ec2_client + def _is_initialized(self): + """Return true if Capacity Block Manager has been already initialized.""" + return self._capacity_blocks_update_time is not None + def get_reserved_nodenames(self, nodes: List[SlurmNode]): """Manage nodes part of capacity block reservation. Returns list of reserved nodes.""" try: @@ -151,7 +155,9 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): # create, update or delete slurm reservation for the nodes according to CB details. for capacity_block in capacity_blocks.values(): - slurm_reservation_updated = self._update_slurm_reservation(capacity_block) + slurm_reservation_updated = self._update_slurm_reservation( + capacity_block=capacity_block, do_update=not self._is_initialized() + ) if not slurm_reservation_updated: slurm_reservation_update_errors += 1 @@ -193,10 +199,8 @@ def _is_time_to_update_capacity_blocks_info(self, now: datetime): and every 10 minutes. # TODO: evaluate time to update accordingly to capacity block start time """ - return self._capacity_blocks_update_time is None or ( - self._capacity_blocks_update_time - and seconds_to_minutes((now - self._capacity_blocks_update_time).total_seconds()) - > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD + return not self._is_initialized() or seconds_to_minutes( + (now - self._capacity_blocks_update_time).total_seconds() > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD ) @staticmethod @@ -251,13 +255,16 @@ def _cleanup_leftover_slurm_reservations(self): logger.error("Unable to retrieve list of existing Slurm reservations. %s", e) @staticmethod - def _update_slurm_reservation(capacity_block: CapacityBlock): + def _update_slurm_reservation(capacity_block: CapacityBlock, do_update: bool): """ Update Slurm reservation associated to the given Capacity Block. A CB has five possible states: payment-pending, pending, active, expired and payment-failed, we need to create/delete Slurm reservation accordingly. + Pass do_update to True only if you want to update already created reservations + (e.g. to update the node list or when the CapacityBlockManager is not yet initialized). + Returns True if slurm action is completed correctly, False otherwise. """ @@ -291,8 +298,11 @@ def _log_cb_info(action_info): else: # create or update Slurm reservation if reservation_exists: - _log_cb_info("Updating existing") - update_slurm_reservation(name=slurm_reservation_name, nodes=capacity_block_nodenames) + if do_update: + _log_cb_info("Updating existing") + update_slurm_reservation(name=slurm_reservation_name, nodes=capacity_block_nodenames) + else: + _log_cb_info("Nothing to do. 
Already existing") else: _log_cb_info("Creating") # The reservation should start now, and will be removed when the capacity block will become active diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index c4e10ee35..6611014d9 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -104,6 +104,14 @@ class TestCapacityBlockManager: def capacity_block_manager(self): return CapacityBlockManager("eu-west-2", {}, "fake_boto3_config") + @pytest.mark.parametrize( + ("update_time", "expected_output"), + [(None, False), ("any-value", True), (datetime(2020, 1, 1, 0, 0, 0), True)], + ) + def test_is_initialized(self, capacity_block_manager, update_time, expected_output): + capacity_block_manager._capacity_blocks_update_time = update_time + assert_that(capacity_block_manager._is_initialized()).is_equal_to(expected_output) + def test_ec2_client(self, capacity_block_manager, mocker): ec2_mock = mocker.patch("slurm_plugin.capacity_block_manager.Ec2Client", return_value=mocker.MagicMock()) capacity_block_manager.ec2_client() @@ -298,7 +306,10 @@ def test_get_reserved_nodenames( assert_that(capacity_block_manager._capacity_blocks).is_equal_to(capacity_blocks_from_config) update_res_mock.assert_has_calls( - [call(capacity_block) for capacity_block in capacity_block_manager._capacity_blocks.values()], + [ + call(capacity_block=capacity_block, do_update=False) + for capacity_block in capacity_block_manager._capacity_blocks.values() + ], any_order=True, ) cleanup_mock.assert_called_once() @@ -447,17 +458,24 @@ def test_cleanup_leftover_slurm_reservations( ( "state", "reservation_exists", + "do_update", "expected_create_res_call", "expected_update_res_call", "expected_delete_res_call", "expected_output", ), [ - ("pending", False, True, False, False, True), - ("pending", True, False, True, False, True), - ("active", False, False, False, False, True), - ("active", True, False, False, True, True), - ("active", SlurmCommandError("error checking res"), False, False, False, False), + # Not existing reservation, do_update is useless + ("pending", False, None, True, False, False, True), + # Existing reservation, update_res is called accordingly to do_update value + ("pending", True, True, False, True, False, True), + ("pending", True, False, False, False, False, True), + # Not existing reservation and CB in active state, do_update is useless + ("active", False, None, False, False, False, True), + # Existing reservation and CB in active state, do_update is useless + ("active", True, None, False, False, True, True), + # Error, do_update is useless + ("active", SlurmCommandError("error checking res"), None, False, False, False, False), ], ) def test_update_slurm_reservation( @@ -467,6 +485,7 @@ def test_update_slurm_reservation( capacity_block, state, reservation_exists, + do_update, expected_create_res_call, expected_update_res_call, expected_delete_res_call, @@ -489,7 +508,7 @@ def test_update_slurm_reservation( expected_start_time = datetime(2020, 1, 1, 0, 0, 0) mocker.patch("slurm_plugin.capacity_block_manager.datetime").now.return_value = expected_start_time - output_value = capacity_block_manager._update_slurm_reservation(capacity_block) + output_value = capacity_block_manager._update_slurm_reservation(capacity_block, do_update) assert_that(expected_output).is_equal_to(output_value) # check the right commands to create/delete/update reservations are called accordingly to the state From 
d3ca87f4dc9dc0ddb839bc6921fd3cb9b054e10e Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 7 Nov 2023 18:39:19 +0100 Subject: [PATCH 33/35] Fix is_time_to_update_capacity_blocks_info method Previously we were checking if `seconds_to_minutes()` was `True`. Now we're correctly checking if `seconds_to_minutes` is `> CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD` Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 8 ++++--- .../test_capacity_block_manager.py | 24 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index 7a1464d90..f36ad9b36 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -191,7 +191,7 @@ def get_reserved_nodenames(self, nodes: List[SlurmNode]): return self._reserved_nodenames - def _is_time_to_update_capacity_blocks_info(self, now: datetime): + def _is_time_to_update_capacity_blocks_info(self, current_time: datetime): """ Return true if it's the moment to update capacity blocks info, from ec2 and from config. @@ -199,8 +199,10 @@ def _is_time_to_update_capacity_blocks_info(self, now: datetime): and every 10 minutes. # TODO: evaluate time to update accordingly to capacity block start time """ - return not self._is_initialized() or seconds_to_minutes( - (now - self._capacity_blocks_update_time).total_seconds() > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD + return ( + not self._is_initialized() + or seconds_to_minutes((current_time - self._capacity_blocks_update_time).total_seconds()) + > CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD ) @staticmethod diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 6611014d9..1b445fc79 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -319,27 +319,25 @@ def test_get_reserved_nodenames( cleanup_mock.assert_not_called() @pytest.mark.parametrize( - ("previous_capacity_blocks_update_time", "expected_update_time"), + ("is_initialized", "now", "expected_updated_time"), [ # manager not initialized - (None, True), + (False, datetime(2020, 1, 1, 1, 00, 0), True), # delta < CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD - (datetime(2020, 1, 2, 1, 51, 0), False), - (datetime(2020, 1, 2, 1, 50, 0), False), + (True, datetime(2020, 1, 1, 1, 00, 10), False), + (True, datetime(2020, 1, 1, 1, 9, 0), False), # delta >= CAPACITY_BLOCK_RESERVATION_UPDATE_PERIOD - (datetime(2020, 1, 2, 1, 40, 0), True), - (datetime(2020, 1, 2, 0, 51, 0), True), - (datetime(2020, 1, 1, 0, 51, 0), True), + (True, datetime(2020, 1, 1, 1, 10, 0), True), + (True, datetime(2020, 1, 1, 1, 21, 0), True), + (True, datetime(2020, 1, 1, 2, 00, 0), True), + (True, datetime(2020, 1, 2, 1, 00, 0), True), ], ) def test_is_time_to_update_capacity_blocks_info( - self, mocker, capacity_block_manager, previous_capacity_blocks_update_time, expected_update_time + self, capacity_block_manager, is_initialized, now, expected_updated_time ): - mocked_now = datetime(2020, 1, 2, 1, 51, 0) - mocker.patch("slurm_plugin.capacity_block_manager.datetime").now.return_value = mocked_now - - capacity_block_manager._capacity_blocks_update_time = previous_capacity_blocks_update_time - assert_that(capacity_block_manager._is_time_to_update_capacity_blocks_info(mocked_now)) + capacity_block_manager._capacity_blocks_update_time = datetime(2020, 1, 1, 1, 00, 0) if is_initialized else None + 
assert_that(capacity_block_manager._is_time_to_update_capacity_blocks_info(now)) @pytest.mark.parametrize( ("capacity_blocks", "nodes", "expected_nodenames_in_capacity_block"), From e83b3f37c0632162aa461e3fd3d787090db3da47 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 8 Nov 2023 10:55:01 +0100 Subject: [PATCH 34/35] Do not print warning messages for reserved nodes In the log there was an entry for reserved instances saying: ``` WARNING - Node state check: static node without nodeaddr set, node queue1-st-p5-1(queue1-st-p5-1), node state IDLE+CLOUD+MAINTENANCE+POWERED_DOWN+RESERVED ``` I'm removing the print for the reserved nodes. Signed-off-by: Enrico Usai --- src/slurm_plugin/clustermgtd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index 95604013e..6d404fde6 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -758,6 +758,7 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes): if not node.is_healthy( consider_drain_as_unhealthy=self._config.terminate_drain_nodes, consider_down_as_unhealthy=self._config.terminate_down_nodes, + log_warn_if_unhealthy=node.name not in reserved_nodenames, ): if not self._config.disable_capacity_blocks_management and node.name in reserved_nodenames: # do not consider as unhealthy the nodes reserved for capacity blocks From 298cac3ad2f4b73b84a286d09f8d3e72fed9b9c9 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 9 Nov 2023 12:49:31 +0100 Subject: [PATCH 35/35] Support same CB in multiple queues and compute resources Previously every CB was associated to a single queue and compute resource. Now the queue/compute-resource association is stored in an internal map. Signed-off-by: Enrico Usai --- src/slurm_plugin/capacity_block_manager.py | 32 ++- .../test_capacity_block_manager.py | 201 ++++++++++++++---- 2 files changed, 184 insertions(+), 49 deletions(-) diff --git a/src/slurm_plugin/capacity_block_manager.py b/src/slurm_plugin/capacity_block_manager.py index f36ad9b36..e70cae288 100644 --- a/src/slurm_plugin/capacity_block_manager.py +++ b/src/slurm_plugin/capacity_block_manager.py @@ -54,15 +54,14 @@ class CapacityBlock: Class to store Capacity Block info from EC2 and fleet config. Contains info like: - - queue and compute resource from the config, + - associated queue and compute resources from the config, - state from EC2, - name of the slurm reservation that will be created for this CB. """ - def __init__(self, capacity_block_id, queue_name, compute_resource_name): + def __init__(self, capacity_block_id): self.capacity_block_id = capacity_block_id - self.queue_name = queue_name - self.compute_resource_name = compute_resource_name + self.compute_resources_map: Dict[str, set] = {} self._capacity_block_reservation_info = None self._nodenames = [] @@ -74,6 +73,16 @@ def slurm_reservation_name(self): """Retrieve slurm reservation associated.""" return f"{SLURM_RESERVATION_NAME_PREFIX}{self.capacity_block_id}" + def add_compute_resource(self, queue_name: str, compute_resource_name: str): + """ + Add compute resource to the internal map. + + The same CB can be associated to multiple queues/compute-resources. 
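# Usage sketch (mirrors the unit tests updated further below, not an addition to this patch):
# the same Capacity Block shared by several queues/compute resources builds a dict of sets.
from slurm_plugin.capacity_block_manager import CapacityBlock

cb = CapacityBlock("cr-123456")
cb.add_compute_resource("queue-cb", "compute-resource-cb")
cb.add_compute_resource("queue1", "compute-resource1")
# cb.compute_resources_map == {"queue-cb": {"compute-resource-cb"}, "queue1": {"compute-resource1"}}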
+ """ + compute_resources_by_queue = self.compute_resources_map.get(queue_name, set()) + compute_resources_by_queue.add(compute_resource_name) + self.compute_resources_map.update({queue_name: compute_resources_by_queue}) + def add_nodename(self, nodename: str): """Add node name to the list of nodenames associated to the capacity block.""" self._nodenames.append(nodename) @@ -92,7 +101,7 @@ def is_active(self): def does_node_belong_to(self, node): """Return true if the node belongs to the CB.""" - return node.queue_name == self.queue_name and node.compute_resource_name == self.compute_resource_name + return node.compute_resource_name in self.compute_resources_map.get(node.queue_name, set()) @staticmethod def capacity_block_id_from_slurm_reservation_name(slurm_reservation_name: str): @@ -107,8 +116,7 @@ def is_capacity_block_slurm_reservation(slurm_reservation_name: str): def __eq__(self, other): return ( self.capacity_block_id == other.capacity_block_id - and self.queue_name == other.queue_name - and self.compute_resource_name == other.compute_resource_name + and self.compute_resources_map == other.compute_resources_map ) @@ -387,10 +395,12 @@ def _retrieve_capacity_blocks_from_fleet_config(self): capacity_block_id = self._capacity_reservation_id_from_compute_resource_config( compute_resource_config ) - capacity_block = CapacityBlock( - capacity_block_id=capacity_block_id, - queue_name=queue_name, - compute_resource_name=compute_resource_name, + # retrieve existing CapacityBlock if exists or create a new one. + capacity_block = capacity_blocks.get( + capacity_block_id, CapacityBlock(capacity_block_id=capacity_block_id) + ) + capacity_block.add_compute_resource( + queue_name=queue_name, compute_resource_name=compute_resource_name ) capacity_blocks.update({capacity_block_id: capacity_block}) diff --git a/tests/slurm_plugin/test_capacity_block_manager.py b/tests/slurm_plugin/test_capacity_block_manager.py index 1b445fc79..6a54b8aed 100644 --- a/tests/slurm_plugin/test_capacity_block_manager.py +++ b/tests/slurm_plugin/test_capacity_block_manager.py @@ -10,6 +10,8 @@ # limitations under the License. 
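# Aside (illustrative, not part of the diff): SimpleNamespace, imported just below, lets the
# parametrized cases spell out the expected CapacityBlock contents as plain data, for example:
#
#   from types import SimpleNamespace
#   expected = SimpleNamespace(
#       capacity_block_id="cr-123456",
#       compute_resources_map={"queue-cb": {"compute-resource-cb"}},
#   )
#
# The _build_capacity_blocks helper added at the bottom of the test class then converts these
# namespaces into real CapacityBlock objects before comparing them with the manager's state.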
import logging from datetime import datetime +from types import SimpleNamespace +from typing import Dict from unittest.mock import call import pytest @@ -49,7 +51,9 @@ @pytest.fixture def capacity_block(): - return CapacityBlock(FAKE_CAPACITY_BLOCK_ID, "queue-cb", "compute-resource-cb") + capacity_block = CapacityBlock(FAKE_CAPACITY_BLOCK_ID) + capacity_block.add_compute_resource("queue-cb", "compute-resource-cb") + return capacity_block class TestCapacityBlock: @@ -62,6 +66,21 @@ def test_is_active(self, capacity_block, state, expected_output): capacity_block.update_capacity_block_reservation_info(capacity_block_reservation_info) assert_that(capacity_block.is_active()).is_equal_to(expected_output) + @pytest.mark.parametrize( + ("initial_map", "queue", "compute_resource", "expected_map"), + [ + ({}, "queue1", "cr1", {"queue1": {"cr1"}}), + ({"queue1": {"cr1"}}, "queue1", "cr2", {"queue1": {"cr1", "cr2"}}), + ({"queue1": {"cr1"}}, "queue2", "cr2", {"queue1": {"cr1"}, "queue2": {"cr2"}}), + ({"queue1": {"cr1"}}, "queue2", "cr1", {"queue1": {"cr1"}, "queue2": {"cr1"}}), + ], + ) + def test_add_compute_resource(self, initial_map, queue, compute_resource, expected_map): + capacity_block = CapacityBlock("id") + capacity_block.compute_resources_map = initial_map + capacity_block.add_compute_resource(queue, compute_resource) + assert_that(capacity_block.compute_resources_map).is_equal_to(expected_map) + @pytest.mark.parametrize( "nodes_to_add", [(["node1"]), (["node1", "node2"])], @@ -75,8 +94,10 @@ def test_add_nodename(self, capacity_block, nodes_to_add): ("node", "expected_output"), [ (StaticNode("queue-cb-st-compute-resource-cb-4", "ip-1", "hostname-1", "some_state", "queue-cb"), True), + (StaticNode("queue-cb-st-compute-resource-other-2", "ip-1", "hostname-1", "some_state", "queue-cb"), False), (StaticNode("queue1-st-c5xlarge-4", "ip-1", "hostname-1", "some_state", "queue1"), False), (StaticNode("queue2-st-compute-resource1-4", "ip-1", "hostname-1", "some_state", "queue2"), False), + (StaticNode("queue2-st-compute-resource-cb-2", "ip-1", "hostname-1", "some_state", "queue2"), False), ], ) def test_does_node_belong_to(self, capacity_block, node, expected_output): @@ -155,7 +176,11 @@ def test_ec2_client(self, capacity_block_manager, mocker): # in the second capacity block when updating info from EC2 True, ["node1"], - {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, + { + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ) + }, [ CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} @@ -175,14 +200,18 @@ def test_ec2_client(self, capacity_block_manager, mocker): # in the second capacity block when creating Slurm reservation True, ["node1"], - {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, + { + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ) + }, [ CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} ) ], [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], - [True, False], # reservation creation failed for the first CB + [True, False], # reservation creation failed for the second CB ["queue-cb-st-compute-resource-cb-1"], ), ( @@ -190,7 +219,11 @@ def test_ec2_client(self, capacity_block_manager, mocker): # while creating 
all the slurm reservations True, ["node1"], - {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, + { + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ) + }, [ CapacityReservationInfo( {**FAKE_CAPACITY_BLOCK_INFO, "State": "pending", "CapacityReservationId": "cr-123456"} @@ -204,7 +237,11 @@ def test_ec2_client(self, capacity_block_manager, mocker): # preserve old nodenames because there is a managed exception when contacting EC2 True, ["node1"], - {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, + { + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ) + }, AWSClientError("describe_capacity_reservations", "Boto3Error"), [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], [False], @@ -214,20 +251,30 @@ def test_ec2_client(self, capacity_block_manager, mocker): # preserve old nodenames because there is an unexpected exception when contacting EC2 True, ["node1"], - {"cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb")}, + { + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ) + }, Exception("generic-error"), [StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb")], [False], ["node1"], ), ( - # return nodenames accordingly to the state (only pending) + # return nodenames accordingly to the state of the CB (only not active CBs) True, ["node1"], { - "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), - "cr-234567": CapacityBlock("cr-234567", "queue-cb2", "compute-resource-cb2"), - "cr-345678": CapacityBlock("cr-345678", "queue-cb3", "compute-resource-cb3"), + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ), + "cr-234567": SimpleNamespace( + capacity_block_id="cr-234567", compute_resources_map={"queue-cb2": {"compute-resource-cb2"}} + ), + "cr-345678": SimpleNamespace( + capacity_block_id="cr-345678", compute_resources_map={"queue-cb3": {"compute-resource-cb3"}} + ), }, [ CapacityReservationInfo( @@ -266,10 +313,11 @@ def test_get_reserved_nodenames( mocker.patch.object( capacity_block_manager, "_is_time_to_update_capacity_blocks_info", return_value=is_time_to_update ) + mocked_capacity_blocks_from_config = self._build_capacity_blocks(capacity_blocks_from_config) mocker.patch.object( capacity_block_manager, "_retrieve_capacity_blocks_from_fleet_config", - return_value=capacity_blocks_from_config, + return_value=mocked_capacity_blocks_from_config, ) mocked_client = mocker.MagicMock() expected_ec2_exception = isinstance(capacity_blocks_info_from_ec2, Exception) @@ -300,10 +348,10 @@ def test_get_reserved_nodenames( if all(not action_succeeded for action_succeeded in slurm_reservation_creation_succeeded): # if all slurm reservations actions failed, do not update the values assert_that(capacity_block_manager._capacity_blocks_update_time).is_equal_to(previous_update_time) - assert_that(capacity_block_manager._capacity_blocks).is_not_equal_to(capacity_blocks_from_config) + assert_that(capacity_block_manager._capacity_blocks).is_not_equal_to(mocked_capacity_blocks_from_config) else: assert_that(capacity_block_manager._capacity_blocks_update_time).is_not_equal_to(previous_update_time) - 
assert_that(capacity_block_manager._capacity_blocks).is_equal_to(capacity_blocks_from_config) + assert_that(capacity_block_manager._capacity_blocks).is_equal_to(mocked_capacity_blocks_from_config) update_res_mock.assert_has_calls( [ @@ -342,10 +390,29 @@ def test_is_time_to_update_capacity_blocks_info( @pytest.mark.parametrize( ("capacity_blocks", "nodes", "expected_nodenames_in_capacity_block"), [ + # empty initial list + ( + {}, + [ + StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb"), + StaticNode("queue1-st-compute-resource1-2", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("queue-cb-st-compute-resource-cb-3", "ip-1", "hostname-1", "some_state", "queue-cb"), + StaticNode("queue1-st-compute-resource1-4", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("queue-cb2-st-compute-resource-cb2-5", "ip-1", "hostname-1", "some_state", "queue-cb2"), + StaticNode("queue-cb-st-othercr-6", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("otherqueue-st-compute-resource-cb-7", "ip-1", "hostname-1", "some_state", "otherqueue"), + ], + {}, + ), + # multiple CBs ( { - "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), - "cr-234567": CapacityBlock("id2", "queue-cb2", "compute-resource-cb2"), + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ), + "cr-234567": SimpleNamespace( + capacity_block_id="cr-234567", compute_resources_map={"queue-cb2": {"compute-resource-cb2"}} + ), }, [ StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb"), @@ -360,7 +427,27 @@ def test_is_time_to_update_capacity_blocks_info( "cr-123456": ["queue-cb-st-compute-resource-cb-1", "queue-cb-st-compute-resource-cb-3"], "cr-234567": ["queue-cb2-st-compute-resource-cb2-5"], }, - ) + ), + # same CB used by multiple queues/compute resources + ( + { + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ), + }, + [ + StaticNode("queue-cb-st-compute-resource-cb-1", "ip-1", "hostname-1", "some_state", "queue-cb"), + StaticNode("queue1-st-compute-resource1-2", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("queue-cb-st-compute-resource-cb-3", "ip-1", "hostname-1", "some_state", "queue-cb"), + StaticNode("queue1-st-compute-resource1-4", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("queue-cb2-st-compute-resource-cb2-5", "ip-1", "hostname-1", "some_state", "queue-cb2"), + StaticNode("queue-cb-st-othercr-6", "ip-1", "hostname-1", "some_state", "queue1"), + StaticNode("otherqueue-st-compute-resource-cb-7", "ip-1", "hostname-1", "some_state", "otherqueue"), + ], + { + "cr-123456": ["queue-cb-st-compute-resource-cb-1", "queue-cb-st-compute-resource-cb-3"], + }, + ), ], ) def test_associate_nodenames_to_capacity_blocks( @@ -370,8 +457,9 @@ def test_associate_nodenames_to_capacity_blocks( nodes, expected_nodenames_in_capacity_block, ): - capacity_block_manager._capacity_blocks = capacity_blocks - capacity_block_manager._associate_nodenames_to_capacity_blocks(capacity_blocks, nodes) + mocked_capacity_blocks = self._build_capacity_blocks(capacity_blocks) + capacity_block_manager._capacity_blocks = mocked_capacity_blocks + capacity_block_manager._associate_nodenames_to_capacity_blocks(mocked_capacity_blocks, nodes) for capacity_block_id in capacity_block_manager._capacity_blocks.keys(): # assert in the nodenames list there are only nodes 
associated to the right queue and compute resource @@ -552,7 +640,9 @@ def test_update_slurm_reservation( # exception, keep previous values ( { - "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ) }, [ CapacityReservationInfo( @@ -567,7 +657,9 @@ def test_update_slurm_reservation( # internal error, because trying to update a capacity block not in the list, keep previous values ( { - "cr-123456": CapacityBlock("id", "queue-cb", "compute-resource-cb"), + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ), }, [ CapacityReservationInfo( @@ -582,8 +674,12 @@ def test_update_slurm_reservation( # update old values with new values ( { - "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), - "cr-234567": CapacityBlock("cr-234567", "queue-cb", "compute-resource-cb"), + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ), + "cr-234567": SimpleNamespace( + capacity_block_id="cr-234567", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ), }, [ CapacityReservationInfo( @@ -607,7 +703,8 @@ def test_update_capacity_blocks_info_from_ec2( caplog, ): caplog.set_level(logging.INFO) - capacity_block_manager._capacity_blocks = init_capacity_blocks + mocked_capacity_blocks = self._build_capacity_blocks(init_capacity_blocks) + capacity_block_manager._capacity_blocks = mocked_capacity_blocks expected_exception = isinstance(expected_error, AWSClientError) mocked_client = mocker.MagicMock() mocked_client.describe_capacity_reservations.side_effect = [ @@ -619,10 +716,10 @@ def test_update_capacity_blocks_info_from_ec2( with pytest.raises( CapacityBlockManagerError, match="Unable to retrieve Capacity Blocks information from EC2. 
Boto3Error" ): - capacity_block_manager._update_capacity_blocks_info_from_ec2(init_capacity_blocks) + capacity_block_manager._update_capacity_blocks_info_from_ec2(mocked_capacity_blocks) elif expected_error: - capacity_block_manager._update_capacity_blocks_info_from_ec2(init_capacity_blocks) + capacity_block_manager._update_capacity_blocks_info_from_ec2(mocked_capacity_blocks) assert_that(caplog.text).contains(expected_error) # assert that only existing item has been updated @@ -635,7 +732,7 @@ def test_update_capacity_blocks_info_from_ec2( ) assert_that(capacity_block_manager._capacity_blocks.get("cr-234567")).is_none() else: - capacity_block_manager._update_capacity_blocks_info_from_ec2(init_capacity_blocks) + capacity_block_manager._update_capacity_blocks_info_from_ec2(mocked_capacity_blocks) if init_capacity_blocks: # verify that all the blocks have the updated info from ec2 assert_that( @@ -660,24 +757,40 @@ def test_update_capacity_blocks_info_from_ec2( ( { "queue-cb": { - "compute-resource-cb": { - "CapacityType": "capacity-block", - "CapacityReservationId": "cr-123456", - } + "compute-resource-cb": {"CapacityType": "capacity-block", "CapacityReservationId": "cr-123456"} }, "queue1": { "compute-resource1": {"CapacityType": "on-demand", "CapacityReservationId": "cr-123456"} }, "queue-cb2": { - "compute-resource-cb2": { - "CapacityType": "capacity-block", - "CapacityReservationId": "cr-234567", - } + "compute-resource-cb2": {"CapacityType": "capacity-block", "CapacityReservationId": "cr-234567"} }, }, { - "cr-123456": CapacityBlock("cr-123456", "queue-cb", "compute-resource-cb"), - "cr-234567": CapacityBlock("cr-234567", "queue-cb2", "compute-resource-cb2"), + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", compute_resources_map={"queue-cb": {"compute-resource-cb"}} + ), + "cr-234567": SimpleNamespace( + capacity_block_id="cr-234567", compute_resources_map={"queue-cb2": {"compute-resource-cb2"}} + ), + }, + False, + ), + # Same CB in multiple queues + ( + { + "queue-cb": { + "compute-resource-cb": {"CapacityType": "capacity-block", "CapacityReservationId": "cr-123456"} + }, + "queue1": { + "compute-resource1": {"CapacityType": "capacity-block", "CapacityReservationId": "cr-123456"} + }, + }, + { + "cr-123456": SimpleNamespace( + capacity_block_id="cr-123456", + compute_resources_map={"queue-cb": {"compute-resource-cb"}, "queue1": {"compute-resource1"}}, + ), }, False, ), @@ -702,10 +815,22 @@ def test_retrieve_capacity_blocks_from_fleet_config( with pytest.raises(KeyError): capacity_block_manager._retrieve_capacity_blocks_from_fleet_config() else: - assert_that(expected_capacity_blocks).is_equal_to( + assert_that(self._build_capacity_blocks(expected_capacity_blocks)).is_equal_to( capacity_block_manager._retrieve_capacity_blocks_from_fleet_config() ) + @staticmethod + def _build_capacity_blocks(expected_capacity_blocks_structure: Dict[str, SimpleNamespace]): + """Convert dict with list of SimpleNamespaces to list of CapacityBlocks.""" + expected_capacity_blocks = {} + for capacity_block_id, capacity_block_structure in expected_capacity_blocks_structure.items(): + expected_capacity_block = CapacityBlock(capacity_block_id) + for queue, compute_resources in capacity_block_structure.compute_resources_map.items(): + for compute_resource in compute_resources: + expected_capacity_block.add_compute_resource(queue, compute_resource) + expected_capacity_blocks.update({capacity_block_id: expected_capacity_block}) + return expected_capacity_blocks + @pytest.mark.parametrize( 
("compute_resource_config", "expected_result"), [