From afae3ae36a51caf03b0df233a1d89a20ee486f26 Mon Sep 17 00:00:00 2001 From: Thomas Hahn Date: Thu, 21 Nov 2024 14:23:22 -0500 Subject: [PATCH] Add option to run for a minimum number of cycles --- c++/triqs/mc_tools/mc_generic.cpp | 31 +++++++++++++++++++++++++------ c++/triqs/mc_tools/mc_generic.hpp | 7 +++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/c++/triqs/mc_tools/mc_generic.cpp b/c++/triqs/mc_tools/mc_generic.cpp index ed911739c..78d321931 100644 --- a/c++/triqs/mc_tools/mc_generic.cpp +++ b/c++/triqs/mc_tools/mc_generic.cpp @@ -61,6 +61,8 @@ namespace triqs::mc_tools { // start MPI monitors std::unique_ptr exception_monitor; if (params.propagate_exception and mpi::has_env) exception_monitor = std::make_unique(params.comm); + std::unique_ptr cycle_monitor; + if (params.continue_after_ncycles_done and mpi::has_env) cycle_monitor = std::make_unique(params.comm); // prepare the simulation parameters and statistics auto const rank = params.comm.rank(); @@ -68,13 +70,14 @@ namespace triqs::mc_tools { std::int64_t cycle_counter = 1; double next_print_info = 0.1; double next_check_except = params.check_exception_interval; + double next_check_cycles = params.check_cycles_interval; percentage_done_ = stop_flag ? 100 : 0; nmeasures_done_ = 0; // run simulation for (; !stop_flag; ++cycle_counter) { try { - // do length_cycle MC steps / cycle + // do cycle_length MC steps / cycle for (std::int64_t i = 0; i < params.cycle_length; i++) { if (triqs::signal_handler::received()) throw triqs::signal_handler::exception{}; // Metropolis step @@ -98,7 +101,7 @@ namespace triqs::mc_tools { // current cycle is interrupted, simulation is stopped below std::cerr << "mc_generic::run: Signal caught on node " << rank << "\n" << std::endl; } catch (std::exception const &err) { - // log the error and node number + // log the exception and node number, either abort or report to other nodes std::cerr << "mc_generic::run: Exception occured on node " << rank << "\n" << err.what() << std::endl; if (params.propagate_exception) { exception_monitor->report_local_event(); @@ -113,7 +116,7 @@ namespace triqs::mc_tools { double runtime = run_timer_; // print simulation info - if (runtime > next_print_info || percentage_done_ >= 100) { + if (runtime > next_print_info) { // increase time interval non-linearly next_print_info = 1.25 * runtime + 2.0; if (percentage_done_ < 0) { @@ -127,12 +130,27 @@ namespace triqs::mc_tools { // check for exceptions on other ranks if (exception_monitor && runtime > next_check_except) { - stop_flag = exception_monitor->event_on_any_rank(); next_check_except += params.check_exception_interval; + stop_flag |= exception_monitor->event_on_any_rank(); + } + + // check if we have done all requested cycles + if (percentage_done_ >= 100) { + if (cycle_monitor) { + // if continue_after_ncycles_done == true, report to other ranks and check if they are done as well + cycle_monitor->report_local_event(); + if (runtime > next_check_cycles) { + next_check_cycles += params.check_cycles_interval; + stop_flag |= cycle_monitor->event_on_all_ranks(); + } + } else { + // if continue_after_ncycles_done == false, stop the simulation + stop_flag = true; + } } // update stop flag - stop_flag |= (params.stop_callback() || triqs::signal_handler::received() || percentage_done_ >= 100); + stop_flag |= (params.stop_callback() || triqs::signal_handler::received()); } // stop timer @@ -145,12 +163,13 @@ namespace triqs::mc_tools { int status = (percentage_done_ >= 100 ? 0 : (triqs::signal_handler::received() ? 2 : 1)); triqs::signal_handler::stop(); - // stop exception monitor + // stop MPI monitors if (exception_monitor) { exception_monitor->finalize_communications(); if (exception_monitor->event_on_any_rank()) throw std::runtime_error("MC simulation stopped because an exception occurred on one of the MPI ranks"); } + if (cycle_monitor) cycle_monitor->finalize_communications(); // final reports if (status == 1) report_ << fmt::format("MC simulation stopped because stop_callback() returned true\n"); diff --git a/c++/triqs/mc_tools/mc_generic.hpp b/c++/triqs/mc_tools/mc_generic.hpp index c12e7c6c2..52b35ab84 100644 --- a/c++/triqs/mc_tools/mc_generic.hpp +++ b/c++/triqs/mc_tools/mc_generic.hpp @@ -115,8 +115,15 @@ namespace triqs::mc_tools { /// Should we calibrate the moves during the simulation? Usually false during the accumulation phase. bool enable_calibration = false; + /// Should we continue the simulation on the current rank after the given number of cycles is done and wait for + /// all other ranks to finish as well or should we stop immediately? + bool continue_after_ncycles_done = false; + /// Time interval (in seconds) after which the simulation checks for exceptions on other nodes. double check_exception_interval = 1; + + /// Time interval (in seconds) after which the simulation checks if all other nodes have finished their cycles. + double check_cycles_interval = 1; }; /**