Skip to content

Commit

Permalink
Move common funcs to toolbox.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ferdonline committed Nov 13, 2024
1 parent e239326 commit 5a4b89c
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 71 deletions.
43 changes: 7 additions & 36 deletions tools/rebalance-corenrn-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,14 @@
import heapq
import itertools
import logging
import math
import os
import sys

# Numpy may be required (histogram)
numpy = None
from toolbox import get_dat_entry_size as get_entry_size
from toolbox import show_histogram, with_progress

DEFAULT_OUTPUT_FILE = "rebalanced-files.dat"
CORENRN_SKIP_MARK = "-1"
PROGRESS_STEPS = 50
DEFAULT_HISTOGRAM_NBINS = 40
DEFAULT_RANKS_PER_MACHINE = 40


Expand Down Expand Up @@ -129,35 +126,6 @@ def batch(iterable, first=0):
out.write(entry + "\n")


def get_entry_size(base_dir, dat_entry):
    """Return the size in bytes of the data file that backs a dat entry.

    The file inspected is ``<base_dir>/<dat_entry>_2.dat``.
    """
    target = os.path.join(base_dir, f"{dat_entry}_2.dat")
    size_in_bytes = os.path.getsize(target)
    return size_in_bytes


def with_progress(elements):
    """Yield every item of ``elements`` unchanged while reporting progress.

    The total is announced once through ``logging``. During iteration a line
    holding the current index and the completion percentage is written to
    stderr at a fixed stride, which results in at most ``PROGRESS_STEPS``
    reports. ``elements`` must support ``len()``.
    """
    count = len(elements)
    stride = math.ceil(count / PROGRESS_STEPS)
    logging.info(f"Processing {count} entries")
    position = 0
    for item in elements:
        if position % stride == 0:
            print(f"{position:10} [{position*100/count:3.0f}%]", file=sys.stderr)
        yield item
        position += 1


def show_histogram(buckets, n_bins=DEFAULT_HISTOGRAM_NBINS):
    """Print a text histogram of ``buckets``, one line per non-empty bin.

    Bin edges are divided by 1024*1024 for display. The function uses the
    module-level ``numpy`` name, so that name must have been bound to the
    real module before this is called.
    """
    logging.info("Histogram of the Machine accumulated data")
    counts, edges = numpy.histogram(buckets, bins=n_bins)
    # Pair each bin's lower edge with its upper edge and its population
    for lower, upper, count in zip(edges[:-1], edges[1:], counts):
        if not count:
            continue
        print(f" [{lower/(1024*1024):5.0f} - {upper/(1024*1024):5.0f}]: {count:0d}")


def main():
parser = argparse.ArgumentParser(
usage="%(prog)s [OPTION]... <input_file> <n_machines>",
Expand Down Expand Up @@ -207,8 +175,11 @@ def main():
logging.basicConfig(level=logging_level, format="%(levelname)s :: %(message)s")

if args.histogram:
global numpy
import numpy
try:
import numpy as _ # noqa
except ImportError:
logging.error("Numpy is required to compute histograms")
return 1

if not os.path.isfile(args.input_file):
logging.error("Input file could not be found!")
Expand Down
39 changes: 4 additions & 35 deletions tools/rebalance-stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,16 @@
"""

import argparse
import math
import numpy
import os
import sys
from toolbox import get_dat_entry_size, show_histogram, with_progress

PROGRESS_STEPS = 50
CORENRN_SKIP_MARK = "-1"
DEFAULT_HISTOGRAM_NBINS = 40
DEFAULT_RANKS_PER_MACHINE = 40


def files_dat_load_ranks(input_file, n_machines, ranks_per_machine, base_dir):
"""From a files.dat compute the total amount of data to load per rank
"""
print(f"Reading from input file: {input_file}")
base_dir = base_dir or os.path.dirname(input_file)
n_ranks = n_machines * ranks_per_machine
Expand All @@ -32,7 +30,7 @@ def files_dat_load_ranks(input_file, n_machines, ranks_per_machine, base_dir):
for i, line in enumerate(with_progress(file.readlines())):
if line[:2] == CORENRN_SKIP_MARK:
continue
size = get_entry_size(base_dir, line.strip())
size = get_dat_entry_size(base_dir, line.strip())
ranks_size[i % n_ranks] += size

return ranks_size
Expand Down Expand Up @@ -81,34 +79,5 @@ def main():
show_histogram(ranks_size)


def show_histogram(buckets, n_bins=DEFAULT_HISTOGRAM_NBINS):
    """Print a text histogram of ``buckets``, skipping empty bins.

    Each printed line shows the lower and upper edge of a bin, scaled down
    by 2**20 (MiB), followed by the number of values that fell into it.
    """
    scale = float(1 << 20)
    freq, bins = numpy.histogram(buckets, bins=n_bins)
    for idx, count in enumerate(freq):
        if count:
            # bins holds one more edge than there are bins
            low, high = bins[idx], bins[idx + 1]
            print(f" [{low/scale:5.0f} - {high/scale:5.0f}]: {count:0d}")


def get_entry_size(base_dir, dat_entry):
    """Look up how many bytes ``<base_dir>/<dat_entry>_2.dat`` occupies."""
    return os.path.getsize(os.path.join(base_dir, f"{dat_entry}_2.dat"))


def with_progress(elements):
    """Pass the items of ``elements`` through, printing progress as it goes.

    The entry count goes to stdout first. Progress lines (index plus
    percentage) go to stderr once every ``ceil(total / PROGRESS_STEPS)``
    items. ``elements`` must support ``len()``.
    """
    total = len(elements)
    step = math.ceil(total / PROGRESS_STEPS)
    print(f"Processing {total} entries")
    for idx, item in enumerate(elements):
        due = idx % step == 0
        if due:
            percent = idx * 100 / total
            print(f"{idx:10} [{percent:3.0f}%]", file=sys.stderr)
        yield item


if __name__ == "__main__":
main()
40 changes: 40 additions & 0 deletions tools/toolbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Blue Brain Project - EPFL, 2024
"""A library of functions shared across tools.
"""

import math
import os
import sys

# Upper bound on how many progress lines with_progress() prints per run
PROGRESS_STEPS = 50
# Bin count show_histogram() uses when the caller does not pass n_bins
DEFAULT_HISTOGRAM_NBINS = 40


def show_histogram(buckets, n_bins=DEFAULT_HISTOGRAM_NBINS):
    """Print a compact text histogram of ``buckets`` to stdout.

    One line is printed per non-empty bin, showing the bin's lower and upper
    edge in MiB followed by the number of values inside it.
    """
    import numpy  # optional dependency, only needed when a histogram is shown

    bytes_per_mib = float(1 << 20)
    counts, edges = numpy.histogram(buckets, bins=n_bins)
    # zip() stops at the shortest input, so the extra trailing edge is ignored
    for lower, upper, count in zip(edges, edges[1:], counts):
        if count:
            print(f" [{lower/bytes_per_mib:5.0f} - {upper/bytes_per_mib:5.0f}]: {count:0d}")


def get_dat_entry_size(base_dir, dat_entry):
    """Return the byte size of the ``<dat_entry>_2.dat`` file under ``base_dir``."""
    entry_path = os.path.join(base_dir, f"{dat_entry}_2.dat")
    return os.stat(entry_path).st_size


def with_progress(elements):
    """Yield each item of ``elements`` unchanged, reporting progress on the way.

    The number of entries is printed to stdout up front. While iterating, a
    line with the current index and the completion percentage is written to
    stderr every ``ceil(total / PROGRESS_STEPS)`` items, i.e. at most
    ``PROGRESS_STEPS`` times.

    Args:
        elements: Any iterable. Inputs that do not support ``len()`` (such as
            generators) are read into a list first so the total is known.

    Yields:
        The items of ``elements``, in their original order.
    """
    try:
        total_elems = len(elements)
    except TypeError:
        # Unsized iterable: the percentage needs a total, so materialize it once
        elements = list(elements)
        total_elems = len(elements)

    # An empty input gives ceil(0) == 0; clamp so the step is always a valid modulus
    report_every = max(1, math.ceil(total_elems / PROGRESS_STEPS))
    print(f"Processing {total_elems} entries")
    for i, elem in enumerate(elements):
        if i % report_every == 0:
            print(f"{i:10} [{i*100/total_elems:3.0f}%]", file=sys.stderr)
        yield elem

0 comments on commit 5a4b89c

Please sign in to comment.