Skip to content

Commit

Permalink
Merge pull request #112 from QUARK-framework/111-data-loss-on-keyboard-interrupt
Browse files Browse the repository at this point in the history

Save results of current benchmark run when CTRL-C is detected
  • Loading branch information
Marvmann authored Mar 4, 2024
2 parents 7e46b0a + 0d702ec commit a20dfa6
Showing 1 changed file with 58 additions and 53 deletions.
111 changes: 58 additions & 53 deletions src/BenchmarkManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,59 +125,64 @@ def run_benchmark(self, benchmark_backlog: list, repetitions: int):
"""
git_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", )
git_revision_number, git_uncommitted_changes = get_git_revision(git_dir)

try:
for idx_backlog, backlog_item in enumerate(benchmark_backlog):
benchmark_records: [BenchmarkRecord] = []
path = f"{self.store_dir}/benchmark_{idx_backlog}"
Path(path).mkdir(parents=True, exist_ok=True)
with open(f"{path}/application_config.json", 'w') as filehandler:
json.dump(backlog_item["config"], filehandler, indent=2)
for i in range(1, repetitions + 1):
logging.info(f"Running backlog item {idx_backlog + 1}/{len(benchmark_backlog)},"
f" Iteration {i}/{repetitions}:")
try:

self.benchmark_record_template = BenchmarkRecord(idx_backlog,
datetime.today().strftime('%Y-%m-%d-%H-%M-%S'),
git_revision_number, git_uncommitted_changes,
i, repetitions)
self.application.metrics.set_module_config(backlog_item["config"])
problem, preprocessing_time = self.application.preprocess(None, backlog_item["config"],
store_dir=path, rep_count=i)
self.application.metrics.set_preprocessing_time(preprocessing_time)
self.application.save(path, i)

processed_input, benchmark_record = self.traverse_config(backlog_item["submodule"], problem,
path, rep_count=i)

_, postprocessing_time = self.application.postprocess(processed_input, None, store_dir=path,
rep_count=i)
self.application.metrics.set_postprocessing_time(postprocessing_time)
self.application.metrics.validate()
benchmark_record.append_module_record_left(deepcopy(self.application.metrics))
benchmark_records.append(benchmark_record)

except Exception as error:
logging.exception(f"Error during benchmark run: {error}", exc_info=True)
if self.fail_fast:
raise

for record in benchmark_records:
record.sum_up_times()

# Wait until all MPI processes have finished and save results on rank 0
comm.Barrier()
if comm.Get_rank() == 0:
with open(f"{path}/results.json", 'w') as filehandler:
json.dump([x.get() for x in benchmark_records], filehandler, indent=2, cls=NumpyEncoder)

logging.info("")
logging.info(" =============== Run finished =============== ")
logging.info("")

except KeyboardInterrupt:
logging.warning("CTRL-C detected. Still trying to create results.json.")
break_flag = False

for idx_backlog, backlog_item in enumerate(benchmark_backlog):
benchmark_records: [BenchmarkRecord] = []
path = f"{self.store_dir}/benchmark_{idx_backlog}"
Path(path).mkdir(parents=True, exist_ok=True)
with open(f"{path}/application_config.json", 'w') as filehandler:
json.dump(backlog_item["config"], filehandler, indent=2)
for i in range(1, repetitions + 1):
logging.info(f"Running backlog item {idx_backlog + 1}/{len(benchmark_backlog)},"
f" Iteration {i}/{repetitions}:")
try:

self.benchmark_record_template = BenchmarkRecord(idx_backlog,
datetime.today().strftime('%Y-%m-%d-%H-%M-%S'),
git_revision_number, git_uncommitted_changes,
i, repetitions)
self.application.metrics.set_module_config(backlog_item["config"])
problem, preprocessing_time = self.application.preprocess(None, backlog_item["config"],
store_dir=path, rep_count=i)
self.application.metrics.set_preprocessing_time(preprocessing_time)
self.application.save(path, i)

processed_input, benchmark_record = self.traverse_config(backlog_item["submodule"], problem,
path, rep_count=i)

_, postprocessing_time = self.application.postprocess(processed_input, None, store_dir=path,
rep_count=i)
self.application.metrics.set_postprocessing_time(postprocessing_time)
self.application.metrics.validate()
benchmark_record.append_module_record_left(deepcopy(self.application.metrics))
benchmark_records.append(benchmark_record)

except KeyboardInterrupt:
logging.warning("CTRL-C detected during run_benchmark. Still trying to create results.json.")
break_flag = True
break

except Exception as error:
logging.exception(f"Error during benchmark run: {error}", exc_info=True)
if self.fail_fast:
raise

for record in benchmark_records:
record.sum_up_times()

# Wait until all MPI processes have finished and save results on rank 0
comm.Barrier()
if comm.Get_rank() == 0:
with open(f"{path}/results.json", 'w') as filehandler:
json.dump([x.get() for x in benchmark_records], filehandler, indent=2, cls=NumpyEncoder)

logging.info("")
logging.info(" =============== Run finished =============== ")
logging.info("")

if break_flag:
break

def traverse_config(self, module: dict, input_data: any, path: str, rep_count: int) -> (any, BenchmarkRecord):
"""
Expand Down

0 comments on commit a20dfa6

Please sign in to comment.