Skip to content

Commit

Permalink
Split oversubscribed job list
Browse files Browse the repository at this point in the history
Split the oversubscribed job list so that the following are available, ready to consume:
*  oversubscribed job list with single node allocation
*  oversubscribed job list with multiple node allocation
*  oversubscribed single node list
*  oversubscribed multi node list

Signed-off-by: Luca Carrogu <[email protected]>
  • Loading branch information
lukeseawalker committed Sep 5, 2023
1 parent b729445 commit 7b79c6f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 13 deletions.
38 changes: 29 additions & 9 deletions src/slurm_plugin/instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,11 @@ def _add_instances_for_resume_file(
if not self.temp_jls_for_node_sharing:
# node scaling for oversubscribe nodes
self._scaling_for_nodes(
node_list=slurm_resume_data.nodes_oversubscribe,
node_list=list(
dict.fromkeys(
slurm_resume_data.single_node_oversubscribe + slurm_resume_data.multi_node_oversubscribe
)
),
launch_batch_size=launch_batch_size,
update_node_address=update_node_address,
all_or_nothing_batch=all_or_nothing_batch,
Expand Down Expand Up @@ -800,7 +804,8 @@ def _get_slurm_resume_data(self, slurm_resume: Dict[str, any], node_list: List[s
SlurmResumeData object contains the following:
* the node list for jobs with oversubscribe != NO
* the node list for jobs with oversubscribe == NO
* the job list with oversubscribe != NO
* the job list with single node allocation with oversubscribe != NO
* the job list with multi node allocation with oversubscribe != NO
* the job list with single node allocation with oversubscribe == NO
* the job list with multi node allocation with oversubscribe == NO
Expand Down Expand Up @@ -843,10 +848,12 @@ def _get_slurm_resume_data(self, slurm_resume: Dict[str, any], node_list: List[s
"""
jobs_single_node_no_oversubscribe = []
jobs_multi_node_no_oversubscribe = []
jobs_oversubscribe = []
jobs_single_node_oversubscribe = []
jobs_multi_node_oversubscribe = []
single_node_no_oversubscribe = []
multi_node_no_oversubscribe = []
nodes_oversubscribe = []
single_node_oversubscribe = []
multi_node_oversubscribe = []

slurm_resume_jobs = self._parse_slurm_resume(slurm_resume)

Expand All @@ -859,12 +866,21 @@ def _get_slurm_resume_data(self, slurm_resume: Dict[str, any], node_list: List[s
jobs_multi_node_no_oversubscribe.append(job)
multi_node_no_oversubscribe.extend(job.nodes_resume)
else:
jobs_oversubscribe.append(job)
nodes_oversubscribe.extend(job.nodes_resume)
if len(job.nodes_resume) == 1:
jobs_single_node_oversubscribe.append(job)
single_node_oversubscribe.extend(job.nodes_resume)
else:
jobs_multi_node_oversubscribe.append(job)
multi_node_oversubscribe.extend(job.nodes_resume)

nodes_difference = list(
set(node_list)
- (set(nodes_oversubscribe) | set(single_node_no_oversubscribe) | set(multi_node_no_oversubscribe))
- (
set(single_node_oversubscribe)
| set(multi_node_oversubscribe)
| set(single_node_no_oversubscribe)
| set(multi_node_no_oversubscribe)
)
)
if nodes_difference:
logger.warning(
Expand All @@ -873,8 +889,12 @@ def _get_slurm_resume_data(self, slurm_resume: Dict[str, any], node_list: List[s
)
self._update_failed_nodes(set(nodes_difference), "InvalidNodenameError")
return SlurmResumeData(
nodes_oversubscribe=list(dict.fromkeys(nodes_oversubscribe)),
jobs_oversubscribe=jobs_oversubscribe,
# With Oversubscribe
single_node_oversubscribe=list(dict.fromkeys(single_node_oversubscribe)),
multi_node_oversubscribe=list(dict.fromkeys(multi_node_oversubscribe)),
jobs_single_node_oversubscribe=jobs_single_node_oversubscribe,
jobs_multi_node_oversubscribe=jobs_multi_node_oversubscribe,
# With No Oversubscribe
single_node_no_oversubscribe=single_node_no_oversubscribe,
multi_node_no_oversubscribe=multi_node_no_oversubscribe,
jobs_single_node_no_oversubscribe=jobs_single_node_no_oversubscribe,
Expand Down
12 changes: 8 additions & 4 deletions src/slurm_plugin/slurm_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,14 +154,18 @@ class SlurmResumeData:
jobs_single_node_no_oversubscribe: List[SlurmResumeJob]
# List of exclusive job allocated to more than 1 node each
jobs_multi_node_no_oversubscribe: List[SlurmResumeJob]
# List of non-exclusive job
jobs_oversubscribe: List[SlurmResumeJob]
# List of non-exclusive job allocated to 1 node each
jobs_single_node_oversubscribe: List[SlurmResumeJob]
# List of non-exclusive job allocated to more than 1 node each
jobs_multi_node_oversubscribe: List[SlurmResumeJob]
# List of node allocated to single node exclusive job
single_node_no_oversubscribe: List[str]
# List of node allocated to multiple node exclusive job
multi_node_no_oversubscribe: List[str]
# List of node allocated to non-exclusive job
nodes_oversubscribe: List[str]
# List of node allocated to single node non-exclusive job
single_node_oversubscribe: List[str]
# List of node allocated to multiple node non-exclusive job
multi_node_oversubscribe: List[str]


class SlurmNode(metaclass=ABCMeta):
Expand Down

0 comments on commit 7b79c6f

Please sign in to comment.