Skip to content

Commit

Permalink
Fix support for hetero LSF environments
Browse files Browse the repository at this point in the history
Don't assume that all nodes share the same topology
as the HNP - it is an unnecessary restriction since
we know the actual topology of each node.

Signed-off-by: Ralph Castain <[email protected]>
(cherry picked from commit c515f92)
  • Loading branch information
rhc54 committed Oct 29, 2024
1 parent 8b6551a commit 63f41e3
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 6 deletions.
6 changes: 6 additions & 0 deletions src/docs/show-help-files/help-rmaps_rank_file.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,9 @@ to a different process.
If this is intentional then you must pass the "overload-allowed"
qualifier to the --bind-to option.
--bind-to :overload-allowed
#
[resource-not-found]
The specified LSF affinity file contained a node (%s) that is not in your
allocation. We therefore cannot map a process rank to it. Please
check your allocation and affinity file to ensure the latter only
contains allocated nodes.
61 changes: 55 additions & 6 deletions src/mca/rmaps/rank_file/rmaps_rank_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,28 @@ static int prte_rmaps_rf_process_lsf_affinity_hostfile(prte_job_t *jdata,
return PRTE_SUCCESS;
}

/*
 * Decide whether "name" refers to the node "nd".
 *
 * A match is declared when any of the following holds (checked in order):
 *   1. "name" is identical to the node's primary name;
 *   2. the node's primary name equals prte_process_info.nodename and
 *      "name" is one of the loopback spellings "localhost" or "127.0.0.1";
 *   3. "name" is identical to one of the entries in the node's
 *      NULL-terminated alias array (when that array is present).
 *
 * All comparisons are exact strcmp() comparisons.
 *
 * Returns true on a match, false otherwise.
 */
static bool quickmatch(prte_node_t *nd, char *name)
{
    char **alias;

    /* exact hit on the node's primary name */
    if (0 == strcmp(nd->name, name)) {
        return true;
    }

    /* loopback spellings only match the node named by prte_process_info */
    if (0 == strcmp(nd->name, prte_process_info.nodename)) {
        if (0 == strcmp(name, "localhost")) {
            return true;
        }
        if (0 == strcmp(name, "127.0.0.1")) {
            return true;
        }
    }

    /* nothing left to try if the node carries no aliases */
    if (NULL == nd->aliases) {
        return false;
    }

    /* walk the NULL-terminated alias list looking for an exact hit */
    for (alias = nd->aliases; NULL != *alias; ++alias) {
        if (0 == strcmp(*alias, name)) {
            return true;
        }
    }

    return false;
}

static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, char **aff_rankfile)
{
FILE *fp;
Expand All @@ -769,9 +791,9 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c
char *tmp_str = NULL;
size_t len;
char **cpus;
int i;
int i, j;
hwloc_obj_t obj;
prte_topology_t *my_topo = NULL;
prte_node_t *node, *nptr;

if( NULL != *aff_rankfile) {
free(*aff_rankfile);
Expand Down Expand Up @@ -839,12 +861,39 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c
// Convert the Physical CPU set from LSF to a Hwloc logical CPU set
pmix_output_verbose(20, prte_rmaps_base_framework.framework_output,
"mca:rmaps:rf: (lsf) Convert Physical CPUSET from <%s>", sep);
my_topo = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, 0);

// find the named host
nptr = NULL;
for (j = 0; j < prte_node_pool->size; j++) {
node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, j);
if (NULL == node) {
continue;
}
if (quickmatch(node, hstname)) {
nptr = node;
break;
}
}
if (NULL == nptr) {
/* wasn't found - that is an error */
pmix_show_help("help-rmaps_rank_file.txt",
"resource-not-found", true,
hstname);
fclose(fp);
close(fp_rank);
return PRTE_ERROR;
}

cpus = PMIX_ARGV_SPLIT_COMPAT(sep, ',');
for(i = 0; NULL != cpus[i]; ++i) {
// assume HNP has the same topology as other nodes
obj = hwloc_get_pu_obj_by_os_index(my_topo->topo, strtol(cpus[i], NULL, 10)) ;

// get the specified object
obj = hwloc_get_pu_obj_by_os_index(nptr->topology->topo, strtol(cpus[i], NULL, 10)) ;
if (NULL == obj) {
PMIX_ARGV_FREE_COMPAT(cpus);
fclose(fp);
close(fp_rank);
return PRTE_ERROR;
}
free(cpus[i]);
// 10 max number of digits in an int
cpus[i] = (char*)malloc(sizeof(char) * 10);
Expand Down

0 comments on commit 63f41e3

Please sign in to comment.