Skip to content

Commit

Permalink
Merge #588: run parallel jobs without a lock file
Browse files Browse the repository at this point in the history
  • Loading branch information
ftessier authored Apr 12, 2021
2 parents 5c4f81e + 0225835 commit d2ec76e
Show file tree
Hide file tree
Showing 39 changed files with 1,125 additions and 102 deletions.
102 changes: 87 additions & 15 deletions HEN_HOUSE/egs++/egs_application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ using namespace std;
#include <sys/statvfs.h>
#endif

#define MAXIMUM_JOB_NUMBER 8192 // GPSC1: 256 nodes with 16 cores (32 threads)
//#define MAXIMUM_JOB_NUMBER 1024

static char __egs_app_msg1[] = "EGS_Application::EGS_Application(int,char**):";
static char __egs_app_msg2[] = "EGS_Application::initSimulation():";
static char __egs_app_msg3[] = "EGS_Application::runSimulation():";
Expand Down Expand Up @@ -220,7 +223,7 @@ void EGS_Application::storeGeometryStep(int ireg, int inew,
}

EGS_Application::EGS_Application(int argc, char **argv) : input(0), geometry(0),
source(0), rndm(0), run(0), simple_run(false), current_case(0),
source(0), rndm(0), run(0), simple_run(false), uniform_run(false), current_case(0),
last_case(0), data_out(0), data_in(0), a_objects(0),
ghistory(new EGS_GeometryHistory) {

Expand Down Expand Up @@ -389,22 +392,43 @@ EGS_Application::EGS_Application(int argc, char **argv) : input(0), geometry(0),
i_parallel = 0;
}
}
else if (have_np || have_ip)
egsWarning("%s\n to specify a parallel run you need both,"
" the -P and -j command line options\n",__egs_app_msg1);
else if (have_np && !have_ip) { // user wants to reset n_parallel
// and combine parallel jobs
n_parallel = ::strtol(npar.c_str(),0,10);
simple_run = true;
}
else if (n_parallel && !have_ip) { // user wants to combine
// parallel jobs
simple_run = true;
}
else if (!have_np && have_ip) {
egsWarning("\n%s\n to specify a parallel run you need both,"
" the -P and -j command line options\n\n",__egs_app_msg1);
}

//
// *** see if user wants simple job control
//
{
for (int j=1; j<argc; j++) {
string tmp = argv[j];
if (tmp == "-s" || tmp == "--simple-run") {
simple_run = true;
//for(int i=j; i<argc-1; i++) argv[i] = argv[i+1];
//argc--;
break;
}
for (int j=1; j<argc; j++) {
string tmp = argv[j];
if (tmp == "-s" || tmp == "--simple-run") {
simple_run = true;
//for(int i=j; i<argc-1; i++) argv[i] = argv[i+1];
//argc--;
break;
}
}

//
// *** See if user wants uniform job control.
// (Takes precedence over simple job control)
//
for (int j=1; j<argc; j++) {
string tmp = argv[j];
if (tmp == "-u" || tmp == "--urc") {
uniform_run = true;
simple_run = false;
break;
}
}

Expand Down Expand Up @@ -561,6 +585,39 @@ int EGS_Application::addState(istream &data) {
return 0;
}

/*! \brief Check whether a filesystem entry can be stat'ed.

  \param name Path to probe.
  \return true when stat() succeeds on \a name, false otherwise.
  Any entry stat() can resolve counts, not only regular files.
*/
bool fileExists(const string &name) {
    struct stat info;
    const int status = stat(name.c_str(), &info);
    if (status != 0) {
        return false;
    }
    return true;
}

int EGS_Application::howManyJobsDone() {

char buf[512];
int n_of_egsdat = 0;

for (int i = first_parallel; i < first_parallel + n_parallel; i++) {
sprintf(buf,"%s_w%d.egsdat",final_output_file.c_str(),i);
string dfile = egsJoinPath(app_dir,buf);
if (fileExists(dfile)) {
n_of_egsdat++;
}
}

return n_of_egsdat;
}

/*! \brief Combine the parallel-run results and report them.

  Calls combineResults(); when that succeeds (returns 0), every entry of
  a_objects_list is asked to report its results and outputResults() is
  invoked.

  \return The non-zero code from combineResults() on failure, 0 otherwise.
*/
int EGS_Application::combinePartialResults() {
    const int status = combineResults();
    if (status == 0) {
        // Reporting and output only happen after a successful combine.
        size_t k = 0;
        while (k < a_objects_list.size()) {
            a_objects_list[k]->reportResults();
            ++k;
        }
        outputResults();
    }
    return status;
}

int EGS_Application::combineResults() {
egsInformation(
"\n Suming the following .egsdat files:\n"
Expand All @@ -571,8 +628,16 @@ int EGS_Application::combineResults() {
EGS_I64 last_ncase = 0;
int ndat = 0;
bool ok = true;
for (int j=1; j<500; j++) {
sprintf(buf,"%s_w%d.egsdat",output_file.c_str(),j);
/*
If trying to combine results and n_parallel is set to 0,
use a hard-coded value for the number of jobs. This is possible
if -P njobs was not passed as an argument.
*/
if (!n_parallel) {
n_parallel = MAXIMUM_JOB_NUMBER;
}
for (int j=first_parallel; j < first_parallel + n_parallel; j++) {
sprintf(buf,"%s_w%d.egsdat",final_output_file.c_str(),j);
string dfile = egsJoinPath(app_dir,buf);
ifstream data(dfile.c_str());
if (data) {
Expand Down Expand Up @@ -648,6 +713,9 @@ int EGS_Application::initRunControl() {
if (simple_run) {
run = new EGS_RunControl(this);
}
else if (uniform_run) {
run = new EGS_UniformRunControl(this);
}
else {
run = EGS_RunControl::getRunControlObject(this);
}
Expand Down Expand Up @@ -751,6 +819,10 @@ void EGS_Application::describeSimulation() {
rndm->describeRNG();
egsInformation("\n\n");
}
if (run) {
run->describeRCO();
egsInformation("\n\n");
}
if (a_objects_list.size() > 0) {
egsInformation("The following ausgab objects are included in the simulation\n");
egsInformation("===========================================================\n\n");
Expand Down
21 changes: 20 additions & 1 deletion HEN_HOUSE/egs++/egs_application.h
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,24 @@ class EGS_EXPORT EGS_Application {
*/
virtual int combineResults();

/*! \brief Combine intermediate results from parallel runs.
Calls combineResults, followed by the output of intermediate
results. Currently used by the uniform RCO while a watcher job
is waiting for all jobs to complete.
*/
virtual int combinePartialResults();

/*! \brief Counts how many *.egsdat files exist in the app folder.
Used by the uniform RCO to estimate how many parallel runs have
completed. This RCO initially deletes existing *.egsdat files
to avoid counting files from previous runs. It is only an estimate
since some jobs might have failed.
*/
int howManyJobsDone();


/*! \brief Output intermediate results.
This function stores the state of the application to a data
Expand Down Expand Up @@ -1053,7 +1071,8 @@ class EGS_EXPORT EGS_Application {
i_parallel, //!< Job index in parallel runs
first_parallel; //!< first parallel job number
bool batch_run; //!< Interactive or batch run.
bool simple_run; //!< Use a simple run control even for parallel runs
bool simple_run; //!< Use a simple run control object for parallel runs
bool uniform_run; //!< Use a uniform run control object for parallel runs
bool is_pegsless; //!< set to true if a pegsless run

EGS_Particle p; /*!< Parameters of the particle that just
Expand Down
Loading

0 comments on commit d2ec76e

Please sign in to comment.