From 5a4b89c9ec205fc1c040533db6cc0370ea29ed50 Mon Sep 17 00:00:00 2001
From: Fernando Pereira <fernando.pereira@epfl.ch>
Date: Wed, 13 Nov 2024 11:39:59 +0100
Subject: [PATCH] Move common funcs to toolbox.py

---
 tools/rebalance-corenrn-data.py | 43 ++++++---------------------------
 tools/rebalance-stats.py        | 39 +++---------------------------
 tools/toolbox.py                | 40 ++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 71 deletions(-)
 create mode 100644 tools/toolbox.py

diff --git a/tools/rebalance-corenrn-data.py b/tools/rebalance-corenrn-data.py
index b29ac1e9..29d11a31 100755
--- a/tools/rebalance-corenrn-data.py
+++ b/tools/rebalance-corenrn-data.py
@@ -12,17 +12,14 @@
 import heapq
 import itertools
 import logging
-import math
 import os
 import sys
 
-# Numpy may be required (histogram)
-numpy = None
+from toolbox import get_dat_entry_size as get_entry_size
+from toolbox import show_histogram, with_progress
 
 DEFAULT_OUTPUT_FILE = "rebalanced-files.dat"
 CORENRN_SKIP_MARK = "-1"
-PROGRESS_STEPS = 50
-DEFAULT_HISTOGRAM_NBINS = 40
 DEFAULT_RANKS_PER_MACHINE = 40
 
 
@@ -129,35 +126,6 @@ def batch(iterable, first=0):
                         out.write(entry + "\n")
 
 
-def get_entry_size(base_dir, dat_entry):
-    """Obtain the file size of a dat entry"""
-    dat_file = f"{dat_entry}_2.dat"
-    file_path = os.path.join(base_dir, dat_file)
-    return os.path.getsize(file_path)
-
-
-def with_progress(elements):
-    """A quick and easy generator for displaying progress while iterating"""
-    total_elems = len(elements)
-    report_every = math.ceil(total_elems / PROGRESS_STEPS)
-    logging.info(f"Processing {total_elems} entries")
-    for i, elem in enumerate(elements):
-        if i % report_every == 0:
-            print(f"{i:10} [{i*100/total_elems:3.0f}%]", file=sys.stderr)
-        yield elem
-
-
-def show_histogram(buckets, n_bins=DEFAULT_HISTOGRAM_NBINS):
-    """A simple histogram CLI visualizer"""
-    logging.info("Histogram of the Machine accumulated data")
-    freq, bins = numpy.histogram(buckets, bins=n_bins)
-    bin_start = bins[0]
-    for count, bin_end in zip(freq, bins[1:]):
-        if count:
-            print(f"  [{bin_start/(1024*1024):5.0f} - {bin_end/(1024*1024):5.0f}]: {count:0d}")
-        bin_start = bin_end
-
-
 def main():
     parser = argparse.ArgumentParser(
         usage="%(prog)s  [OPTION]...  <input_file>  <n_machines>",
@@ -207,8 +175,11 @@ def main():
     logging.basicConfig(level=logging_level, format="%(levelname)s :: %(message)s")
 
     if args.histogram:
-        global numpy
-        import numpy
+        try:
+            import numpy as _  # noqa
+        except ImportError:
+            logging.error("Numpy is required to compute histograms")
+            return 1
 
     if not os.path.isfile(args.input_file):
         logging.error("Input file could not be found!")
diff --git a/tools/rebalance-stats.py b/tools/rebalance-stats.py
index e1dc3d3a..cdb987d5 100755
--- a/tools/rebalance-stats.py
+++ b/tools/rebalance-stats.py
@@ -9,18 +9,16 @@
 """
 
 import argparse
-import math
-import numpy
 import os
-import sys
+from toolbox import get_dat_entry_size, show_histogram, with_progress
 
-PROGRESS_STEPS = 50
 CORENRN_SKIP_MARK = "-1"
-DEFAULT_HISTOGRAM_NBINS = 40
 DEFAULT_RANKS_PER_MACHINE = 40
 
 
 def files_dat_load_ranks(input_file, n_machines, ranks_per_machine, base_dir):
+    """From a files.dat compute the total amount of data to load per rank
+    """
     print(f"Reading from input file: {input_file}")
     base_dir = base_dir or os.path.dirname(input_file)
     n_ranks = n_machines * ranks_per_machine
@@ -32,7 +30,7 @@ def files_dat_load_ranks(input_file, n_machines, ranks_per_machine, base_dir):
         for i, line in enumerate(with_progress(file.readlines())):
             if line[:2] == CORENRN_SKIP_MARK:
                 continue
-            size = get_entry_size(base_dir, line.strip())
+            size = get_dat_entry_size(base_dir, line.strip())
             ranks_size[i % n_ranks] += size
 
     return ranks_size
@@ -81,34 +79,5 @@ def main():
     show_histogram(ranks_size)
 
 
-def show_histogram(buckets, n_bins=DEFAULT_HISTOGRAM_NBINS):
-    """A simple histogram CLI visualizer"""
-    MiB = float(1 << 20)
-    freq, bins = numpy.histogram(buckets, bins=n_bins)
-    bin_start = bins[0]
-    for count, bin_end in zip(freq, bins[1:]):
-        if count:
-            print(f"  [{bin_start/MiB:5.0f} - {bin_end/MiB:5.0f}]: {count:0d}")
-        bin_start = bin_end
-
-
-def get_entry_size(base_dir, dat_entry):
-    """Obtain the file size of a dat entry"""
-    dat_file = f"{dat_entry}_2.dat"
-    file_path = os.path.join(base_dir, dat_file)
-    return os.path.getsize(file_path)
-
-
-def with_progress(elements):
-    """A quick and easy generator for displaying progress while iterating"""
-    total_elems = len(elements)
-    report_every = math.ceil(total_elems / PROGRESS_STEPS)
-    print(f"Processing {total_elems} entries")
-    for i, elem in enumerate(elements):
-        if i % report_every == 0:
-            print(f"{i:10} [{i*100/total_elems:3.0f}%]", file=sys.stderr)
-        yield elem
-
-
 if __name__ == "__main__":
     main()
diff --git a/tools/toolbox.py b/tools/toolbox.py
new file mode 100644
index 00000000..5ec3a3b5
--- /dev/null
+++ b/tools/toolbox.py
@@ -0,0 +1,40 @@
+# Blue Brain Project - EPFL, 2024
+"""A library of functions shared across tools.
+"""
+
+import math
+import os
+import sys
+
+PROGRESS_STEPS = 50
+DEFAULT_HISTOGRAM_NBINS = 40
+
+
+def show_histogram(buckets, n_bins=DEFAULT_HISTOGRAM_NBINS):
+    """Print a CLI histogram of `buckets`: one line per non-empty bin, bounds divided by 2**20 (MiB)"""
+    import numpy  # optional dependency: imported lazily so this module loads without numpy
+    MiB = float(1 << 20)
+    freq, bins = numpy.histogram(buckets, bins=n_bins)
+    bin_start = bins[0]
+    for count, bin_end in zip(freq, bins[1:]):
+        if count:
+            print(f"  [{bin_start/MiB:5.0f} - {bin_end/MiB:5.0f}]: {count:0d}")
+        bin_start = bin_end
+
+
+def get_dat_entry_size(base_dir, dat_entry):
+    """Return the size in bytes of the file `<base_dir>/<dat_entry>_2.dat`"""
+    dat_file = f"{dat_entry}_2.dat"
+    file_path = os.path.join(base_dir, dat_file)
+    return os.path.getsize(file_path)
+
+
+def with_progress(elements):
+    """Yield each item of `elements` (must support len()), reporting progress on stderr as it goes"""
+    total_elems = len(elements)
+    report_every = math.ceil(total_elems / PROGRESS_STEPS)
+    print(f"Processing {total_elems} entries")
+    for i, elem in enumerate(elements):
+        if i % report_every == 0:
+            print(f"{i:10} [{i*100/total_elems:3.0f}%]", file=sys.stderr)
+        yield elem