Skip to content

Commit

Permalink
Implement support for PMIx "bootstrap" group construction
Browse files Browse the repository at this point in the history
Support the PMIx "bootstrap" method for group construction by
correctly aggregating the individual contributions and then
redistributing them.

Signed-off-by: Ralph Castain <[email protected]>
  • Loading branch information
rhc54 committed Dec 18, 2024
1 parent fcb01ca commit f065667
Show file tree
Hide file tree
Showing 3 changed files with 220 additions and 102 deletions.
88 changes: 84 additions & 4 deletions src/mca/grpcomm/direct/grpcomm_direct.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ typedef struct {
pmix_proc_t *members; // initially supplied procs
size_t nmembers;
size_t bootstrap;
bool follower;
pmix_proc_t *addmembers; // procs supplied as add-members
size_t naddmembers;
} prte_grpcomm_direct_group_signature_t;
Expand Down Expand Up @@ -106,10 +107,28 @@ typedef struct {
size_t ndmns;
/** my index in the dmns array */
unsigned long my_rank;
/* number of buckets expected */
size_t nexpected;
/* number reported in */
size_t nreported;
/* type of collective */
bool bootstrap;

/*** NON-BOOTSTRAP TRACKERS ***/
size_t nexpected; // number of buckets expected
size_t nreported; // number reported in

/*** BOOTSTRAP TRACKERS ***/
// "leaders" are group members reporting as
// themselves for bootstrap - they know how
// many leaders there are (which is in the bootstrap
// parameter), but not who they are. Bootstrap is
// complete when nleaders_reported == nleaders (the bootstrap value)
// AND nfollowers_reported == nfollowers
size_t nleaders; // number of leaders expected
size_t nleaders_reported; // number reported in
// "add-members" are procs that report with NULL
// for the proc parameter - thereby indicating that
// they don't know the other procs in the group
size_t nfollowers; // number of add-member procs expected to participate
size_t nfollowers_reported; // number reported in

/* controls values */
bool assignID;
int timeout;
Expand Down Expand Up @@ -185,6 +204,67 @@ void prte_grpcomm_direct_grp_release(int status, pmix_proc_t *sender,
pmix_data_buffer_t *buffer,
prte_rml_tag_t tag, void *cbdata);

/**
 * Debug helper: emit a human-readable, multi-line dump of a group
 * signature via pmix_output() on stream 0.
 *
 * The dump lists the operation, group ID, whether a context ID is being
 * assigned (and its value if so), the member procs, the bootstrap count,
 * the follower flag, and the add-member procs. All lines are collected
 * in an argv array and joined so that the dump is emitted with a single
 * pmix_output() call.
 *
 * @param sig  signature to print; a NULL pointer is silently ignored
 */
static inline void print_signature(prte_grpcomm_direct_group_signature_t *sig)
{
    char **msg = NULL;
    char *tmp;
    size_t n;

    if (NULL == sig) {
        return;
    }

    PMIx_Argv_append_nosize(&msg, "SIGNATURE:");
    pmix_asprintf(&tmp, "\tOP: %s", PMIx_Group_operation_string(sig->op));
    PMIx_Argv_append_nosize(&msg, tmp);
    free(tmp);

    // passing a NULL pointer to %s is undefined, so substitute a marker
    pmix_asprintf(&tmp, "\tGRPID: %s",
                  (NULL == sig->groupID) ? "NULL" : sig->groupID);
    PMIx_Argv_append_nosize(&msg, tmp);
    free(tmp);

    pmix_asprintf(&tmp, "\tASSIGN CTXID: %s", sig->assignID ? "T" : "F");
    PMIx_Argv_append_nosize(&msg, tmp);
    free(tmp);

    if (sig->assignID) {
        // NOTE(review): ctxid is assumed to be an integer type (its
        // declaration is not visible here); the cast makes the argument
        // match %lu regardless of its exact width
        pmix_asprintf(&tmp, "\tCTXID: %lu", (unsigned long) sig->ctxid);
        PMIx_Argv_append_nosize(&msg, tmp);
        free(tmp);
    }

    // size_t is not guaranteed to be unsigned long, so the counts are
    // cast explicitly to match the %lu conversion
    pmix_asprintf(&tmp, "\tNMEMBERS: %lu", (unsigned long) sig->nmembers);
    PMIx_Argv_append_nosize(&msg, tmp);
    free(tmp);
    for (n = 0; n < sig->nmembers; n++) {
        pmix_asprintf(&tmp, "\t\t%s", PMIX_NAME_PRINT(&sig->members[n]));
        PMIx_Argv_append_nosize(&msg, tmp);
        free(tmp);
    }

    pmix_asprintf(&tmp, "\tBOOTSTRAP: %lu", (unsigned long) sig->bootstrap);
    PMIx_Argv_append_nosize(&msg, tmp);
    free(tmp);

    pmix_asprintf(&tmp, "\tFOLLOWER: %s", sig->follower ? "T" : "F");
    PMIx_Argv_append_nosize(&msg, tmp);
    free(tmp);

    pmix_asprintf(&tmp, "\tNADDMEMBERS: %lu", (unsigned long) sig->naddmembers);
    PMIx_Argv_append_nosize(&msg, tmp);
    free(tmp);
    for (n = 0; n < sig->naddmembers; n++) {
        pmix_asprintf(&tmp, "\t\t%s", PMIX_NAME_PRINT(&sig->addmembers[n]));
        PMIx_Argv_append_nosize(&msg, tmp);
        free(tmp);
    }

    // emit the whole dump in one call
    tmp = PMIx_Argv_join(msg, '\n');
    PMIx_Argv_free(msg);
    if (NULL != tmp) {
        pmix_output(0, "%s", tmp);
        free(tmp);
    }
}

END_C_DECLS

#endif
5 changes: 5 additions & 0 deletions src/mca/grpcomm/direct/grpcomm_direct_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ static void sgcon(prte_grpcomm_direct_group_signature_t *p)\
p->members = NULL;
p->nmembers = 0;
p->bootstrap = 0;
p->follower = false;
p->addmembers = NULL;
p->naddmembers = 0;
}
Expand Down Expand Up @@ -133,6 +134,10 @@ static void gccon(prte_grpcomm_group_t *p)
p->ndmns = 0;
p->nexpected = 0;
p->nreported = 0;
p->nleaders = 0;
p->nleaders_reported = 0;
p->nfollowers = 0;
p->nfollowers_reported = 0;
p->assignID = false;
p->timeout = 0;
p->memsize = 0;
Expand Down
Loading

0 comments on commit f065667

Please sign in to comment.