Skip to content

Commit

Permalink
Continue work on pseudo-scheduler
Browse files (browse the repository at this point in the history)
Enable discovery of node pool. Remove extraneous
job and proc states from state machine.

Signed-off-by: Ralph Castain <[email protected]>
  • Loading branch information
rhc54 committed Sep 26, 2023
1 parent b8ee2c5 commit e69dbcf
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 951 deletions.
22 changes: 10 additions & 12 deletions src/mca/ras/base/ras_base_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -66,17 +66,17 @@ int prte_ras_base_node_insert(pmix_list_t *nodes, prte_job_t *jdata)
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), (long) num_nodes));

/* mark the job as being a large-cluster sim if that was requested */
if (1 < prte_ras_base.multiplier) {
prte_set_attribute(&jdata->attributes, PRTE_JOB_MULTI_DAEMON_SIM, PRTE_ATTR_GLOBAL, NULL,
PMIX_BOOL);
if (1 < prte_ras_base.multiplier && NULL != jdata) {
prte_set_attribute(&jdata->attributes, PRTE_JOB_MULTI_DAEMON_SIM,
PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL);
}

/* set the size of the global array - this helps minimize time
* spent doing realloc's
*/
if (PRTE_SUCCESS
!= (rc = pmix_pointer_array_set_size(prte_node_pool,
num_nodes * prte_ras_base.multiplier))) {
rc = pmix_pointer_array_set_size(prte_node_pool,
num_nodes * prte_ras_base.multiplier);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
return rc;
}
Expand All @@ -96,17 +96,15 @@ int prte_ras_base_node_insert(pmix_list_t *nodes, prte_job_t *jdata)
break;
}
}
if (prte_hnp_is_allocated
&& !(PRTE_GET_MAPPING_DIRECTIVE(prte_rmaps_base.mapping)
& PRTE_MAPPING_NO_USE_LOCAL)) {
if (prte_hnp_is_allocated &&
!(PRTE_GET_MAPPING_DIRECTIVE(prte_rmaps_base.mapping) & PRTE_MAPPING_NO_USE_LOCAL)) {
if (NULL != hnp_node->name) {
free(hnp_node->name);
}
hnp_node->name = strdup("prte");
skiphnp = true;
PRTE_SET_MAPPING_DIRECTIVE(prte_rmaps_base.mapping, PRTE_MAPPING_NO_USE_LOCAL);
PRTE_FLAG_SET(hnp_node,
PRTE_NODE_NON_USABLE); // leave this node out of mapping operations
PRTE_FLAG_SET(hnp_node, PRTE_NODE_NON_USABLE); // leave this node out of mapping operations
}
}
}
Expand Down
52 changes: 41 additions & 11 deletions src/tools/psched/psched.c
Original file line number Diff line number Diff line change
Expand Up @@ -599,17 +599,19 @@ int main(int argc, char *argv[])
goto DONE;
}

/* setup the keepalive event registration */
memset(&xfer, 0, sizeof(myxfer_t));
PRTE_PMIX_CONSTRUCT_LOCK(&xfer.lock);
code = PMIX_ERR_JOB_TERMINATED;
PMIX_LOAD_PROCID(&pname, "PMIX_KEEPALIVE_PIPE", PMIX_RANK_UNDEF);
PMIX_INFO_LOAD(&info, PMIX_EVENT_AFFECTED_PROC, &pname, PMIX_PROC);
PMIx_Register_event_handler(&code, 1, &info, 1, parent_died_fn, evhandler_reg_callbk,
(void *) &xfer);
PRTE_PMIX_WAIT_THREAD(&xfer.lock);
PMIX_INFO_DESTRUCT(&info);
PRTE_PMIX_DESTRUCT_LOCK(&xfer.lock);
if (pmix_cmd_line_is_taken(&results, PRTE_CLI_KEEPALIVE)) {
/* setup the keepalive event registration */
memset(&xfer, 0, sizeof(myxfer_t));
PRTE_PMIX_CONSTRUCT_LOCK(&xfer.lock);
code = PMIX_ERR_JOB_TERMINATED;
PMIX_LOAD_PROCID(&pname, "PMIX_KEEPALIVE_PIPE", PMIX_RANK_UNDEF);
PMIX_INFO_LOAD(&info, PMIX_EVENT_AFFECTED_PROC, &pname, PMIX_PROC);
PMIx_Register_event_handler(&code, 1, &info, 1, parent_died_fn, evhandler_reg_callbk,
(void *) &xfer);
PRTE_PMIX_WAIT_THREAD(&xfer.lock);
PMIX_INFO_DESTRUCT(&info);
PRTE_PMIX_DESTRUCT_LOCK(&xfer.lock);
}

/* create my job data object */
jdata = PMIX_NEW(prte_job_t);
Expand Down Expand Up @@ -643,6 +645,20 @@ int main(int argc, char *argv[])
pptr->node = node;
pmix_pointer_array_set_item(jdata->procs, PRTE_PROC_MY_NAME->rank, pptr);

// pass along any hostfile option
opt = pmix_cmd_line_get_param(&results, PRTE_CLI_HOSTFILE);
if (NULL != opt) {
prte_set_attribute(&app->attributes, PRTE_APP_HOSTFILE, PRTE_ATTR_GLOBAL,
opt->values[0], PMIX_STRING);
}

// pass along any dash-host option
opt = pmix_cmd_line_get_param(&results, PRTE_CLI_HOST);
if (NULL != opt) {
prte_set_attribute(&app->attributes, PRTE_APP_DASH_HOST, PRTE_ATTR_GLOBAL,
opt->values[0], PMIX_STRING);
}

/* setup to detect any external allocation */
ret = pmix_mca_base_framework_open(&prte_ras_base_framework,
PMIX_MCA_BASE_OPEN_DEFAULT);
Expand Down Expand Up @@ -674,6 +690,17 @@ int main(int argc, char *argv[])
free(output);
}

// check for default hostfile CLI option
opt = pmix_cmd_line_get_param(&results, PRTE_CLI_DEFAULT_HOSTFILE);
if (NULL != opt) {
if (NULL != prte_default_hostfile) {
// command line overrides environ
free(prte_default_hostfile);
}
prte_default_hostfile = strdup(opt->values[0]);
prte_default_hostfile_given = true;
}

// setup the scheduler itself
psched_scheduler_init();

Expand All @@ -684,6 +711,9 @@ int main(int argc, char *argv[])
prte_process_info.nodename);
}

// trigger the state event to read the allocation
PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_ALLOCATE);

/* loop the event lib until an exit event is detected */
while (prte_event_base_active) {
prte_event_loop(prte_event_base, PRTE_EVLOOP_ONCE);
Expand Down
Loading

0 comments on commit e69dbcf

Please sign in to comment.