From 37b498b053d547a0c4b47531979722ced6dfae2a Mon Sep 17 00:00:00 2001
From: Antonio Bellotta
Date: Wed, 18 Oct 2023 17:37:43 +0200
Subject: [PATCH 1/4] Added node suggestions and fixed negative memory values

---
 neurodamus/cell_distributor.py |  2 +-
 neurodamus/node.py             |  1 +
 neurodamus/utils/memory.py     | 36 ++++++++++++++++++++++++++++++++--
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/neurodamus/cell_distributor.py b/neurodamus/cell_distributor.py
index ed9a0520..2fd5f21c 100644
--- a/neurodamus/cell_distributor.py
+++ b/neurodamus/cell_distributor.py
@@ -313,7 +313,7 @@ def store_metype_stats(metype, n_cells):
             memory_allocated = end_memory - prev_memory
             log_all(logging.DEBUG, " * METype %s: %.1f KiB averaged over %d cells",
                     metype, memory_allocated/n_cells, n_cells)
-            memory_dict[metype] = memory_allocated / n_cells
+            memory_dict[metype] = max(0, memory_allocated / n_cells)
             prev_memory = end_memory
 
         for gid, cell_info in gid_info_items:
diff --git a/neurodamus/node.py b/neurodamus/node.py
index bccf243e..8ad19c15 100644
--- a/neurodamus/node.py
+++ b/neurodamus/node.py
@@ -1958,6 +1958,7 @@ def run(self):
         if SimConfig.dry_run:
             log_stage("============= DRY RUN (SKIP SIMULATION) =============")
             self._dry_run_stats.display_total()
+            self._dry_run_stats.display_node_suggestions()
             return
         if not SimConfig.simulate_model:
             self.sim_init()
diff --git a/neurodamus/utils/memory.py b/neurodamus/utils/memory.py
index 1efb9e76..b857b5ff 100644
--- a/neurodamus/utils/memory.py
+++ b/neurodamus/utils/memory.py
@@ -8,6 +8,7 @@
 import math
 import os
 import json
+import psutil
 
 from ..core import MPI, NeurodamusCore as Nd, run_only_rank0
 
@@ -184,6 +185,7 @@ def __init__(self) -> None:
         self.metype_memory = {}
         self.metype_counts = Counter()
         self.synapse_counts = Counter()
+        self.grand_total = 0
         _, _, self.base_memory, _ = get_task_level_mem_usage()
 
     @run_only_rank0
@@ -267,7 +269,37 @@ def display_total(self):
         logging.info("| {:<40s} | {:12.1f} |".format("Cells", self.cell_memory_total))
         logging.info("| {:<40s} | {:12.1f} |".format("Synapses", self.synapse_memory_total))
         logging.info("+{:-^57}+".format(""))
-        grand_total = full_overhead + self.cell_memory_total + self.synapse_memory_total
-        grand_total = pretty_printing_memory_mb(grand_total)
+        self.grand_total = full_overhead + self.cell_memory_total + self.synapse_memory_total
+        grand_total = pretty_printing_memory_mb(self.grand_total)
         logging.info("| {:<40s} | {:>12s} |".format("GRAND TOTAL", grand_total))
         logging.info("+{:-^57}+".format(""))
+
+    def total_memory_available():
+        """
+        Returns the total memory available in the system in MB
+        """
+        try:
+            virtual_memory = psutil.virtual_memory()
+            return virtual_memory.total / (1024 * 1024)  # Total available memory in MB
+        except Exception as e:
+            logging.error(f"Error: {e}")
+            return None
+
+    @run_only_rank0
+    def display_node_suggestions(self):
+        """
+        Display suggestions for how many nodes are approximately
+        necessary to run the simulation based on the memory available
+        on the current node.
+ """ + node_total_memory = DryRunStats.total_memory_available() + if node_total_memory is None: + logging.warning("Unable to get the total memory available on the current node.") + return + suggested_nodes = math.ceil(self.grand_total / node_total_memory) + logging.info(f"Based on the memory available on the current node, " + f"it is suggested to use at least {suggested_nodes} node(s).") + logging.info("This is just a suggestion and the actual number of nodes " + "needed to run the simulation may be different.") + logging.info(f"The calculation was based on a total memory available of " + f"{pretty_printing_memory_mb(node_total_memory)} on the current node.") From 47b486e808fdaa709500bd092c392df9734712ee Mon Sep 17 00:00:00 2001 From: Antonio Bellotta Date: Fri, 20 Oct 2023 17:22:33 +0200 Subject: [PATCH 2/4] Improved nodes calculation to take into account variable overhead --- neurodamus/utils/memory.py | 39 ++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/neurodamus/utils/memory.py b/neurodamus/utils/memory.py index b857b5ff..db4c56b3 100644 --- a/neurodamus/utils/memory.py +++ b/neurodamus/utils/memory.py @@ -9,6 +9,7 @@ import os import json import psutil +import multiprocessing from ..core import MPI, NeurodamusCore as Nd, run_only_rank0 @@ -185,7 +186,6 @@ def __init__(self) -> None: self.metype_memory = {} self.metype_counts = Counter() self.synapse_counts = Counter() - self.grand_total = 0 _, _, self.base_memory, _ = get_task_level_mem_usage() @run_only_rank0 @@ -269,8 +269,8 @@ def display_total(self): logging.info("| {:<40s} | {:12.1f} |".format("Cells", self.cell_memory_total)) logging.info("| {:<40s} | {:12.1f} |".format("Synapses", self.synapse_memory_total)) logging.info("+{:-^57}+".format("")) - self.grand_total = full_overhead + self.cell_memory_total + self.synapse_memory_total - grand_total = pretty_printing_memory_mb(self.grand_total) + grand_total = full_overhead + self.cell_memory_total + self.synapse_memory_total + grand_total = pretty_printing_memory_mb(grand_total) logging.info("| {:<40s} | {:>12s} |".format("GRAND TOTAL", grand_total)) logging.info("+{:-^57}+".format("")) @@ -285,6 +285,35 @@ def total_memory_available(): logging.error(f"Error: {e}") return None + @run_only_rank0 + def suggest_nodes(self, margin): + """ + A function to calculate the suggested number of nodes to run the simulation + The function takes into account the fact that the memory overhead is + variable with the amount of ranks the simulation it's ran with. + One can also specify a custom margin to add to the memory usage. 
+ """ + + try: + ranks_per_node = os.cpu_count() + except AttributeError: + ranks_per_node = multiprocessing.cpu_count() + + full_overhead = self.base_memory * ranks_per_node + + # initialize variable for iteration + est_nodes = 0 + prev_est_nodes = None + + while prev_est_nodes is None or est_nodes != prev_est_nodes: + prev_est_nodes = est_nodes + mem_usage_per_node = full_overhead + self.cell_memory_total + self.synapse_memory_total + mem_usage_with_margin = mem_usage_per_node * (1 + margin) + est_nodes = math.ceil(mem_usage_with_margin / DryRunStats.total_memory_available()) + full_overhead = self.base_memory * ranks_per_node * est_nodes + + return est_nodes + @run_only_rank0 def display_node_suggestions(self): """ @@ -296,10 +325,12 @@ def display_node_suggestions(self): if node_total_memory is None: logging.warning("Unable to get the total memory available on the current node.") return - suggested_nodes = math.ceil(self.grand_total / node_total_memory) + suggested_nodes = self.suggest_nodes(0.3) logging.info(f"Based on the memory available on the current node, " f"it is suggested to use at least {suggested_nodes} node(s).") logging.info("This is just a suggestion and the actual number of nodes " "needed to run the simulation may be different.") logging.info(f"The calculation was based on a total memory available of " f"{pretty_printing_memory_mb(node_total_memory)} on the current node.") + logging.info("Please remember that it is suggested to use the same class of nodes " + "for both the dryrun and the actual simulation.") From 3de919a0fd9b4ac5643b5fc0b6b22ba80525da74 Mon Sep 17 00:00:00 2001 From: Antonio Bellotta Date: Tue, 24 Oct 2023 17:26:25 +0200 Subject: [PATCH 3/4] Updated docs on dry run, providing explaination for node suggestion --- docs/architecture.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/architecture.rst b/docs/architecture.rst index 8d248432..6e8bc177 100644 --- a/docs/architecture.rst +++ b/docs/architecture.rst @@ -329,6 +329,16 @@ will get a summary of the estimated memory used for cells and synapses, includin memory necessary to load libraries and neurodamus data structures. A grand total is provided to the user as well as a per-cell type and per-synapse type breakdown. +At the end of the execution the user will also be provided with a suggestion on how many nodes +to use in order to run the simulation with the given circuit on the given machine. +Keep in mind that this is just a suggestion and the user is free to use a different number of nodes +if he/she wishes to do so. The suggestion is based on the assumption that the user wants to run +the simulation on the same kind of machine used to run the dry run. The suggestion is also based +on the assumption that the user wants to use all the available memory on each node for the simulation. +The node estimate takes into account the memory usage of the cells and synapses as well as the +variable usage of memory "overhead" that is fixed for each rank but varies depending on the number +of ranks used. + In this paragraph we will go a bit more into details on how the estimation is done. 
 Below you can see the workflow of the dry run mode:

From 006b91c2ceef93431fb6ee7ee60762a1f4225b3c Mon Sep 17 00:00:00 2001
From: Antonio Bellotta
Date: Wed, 25 Oct 2023 12:05:20 +0200
Subject: [PATCH 4/4] Set a maximum number of iterations for the node
 suggestion loop

---
 neurodamus/utils/memory.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/neurodamus/utils/memory.py b/neurodamus/utils/memory.py
index db4c56b3..b0ae5d4f 100644
--- a/neurodamus/utils/memory.py
+++ b/neurodamus/utils/memory.py
@@ -304,13 +304,16 @@ def suggest_nodes(self, margin):
         # initialize variables for the iteration
         est_nodes = 0
         prev_est_nodes = None
+        max_iter = 5
+        iter_count = 0
 
-        while prev_est_nodes is None or est_nodes != prev_est_nodes:
+        while (prev_est_nodes is None or est_nodes != prev_est_nodes) and iter_count < max_iter:
             prev_est_nodes = est_nodes
             mem_usage_per_node = full_overhead + self.cell_memory_total + self.synapse_memory_total
             mem_usage_with_margin = mem_usage_per_node * (1 + margin)
             est_nodes = math.ceil(mem_usage_with_margin / DryRunStats.total_memory_available())
            full_overhead = self.base_memory * ranks_per_node * est_nodes
+            iter_count += 1
 
         return est_nodes
 
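For reference, here is a standalone sketch of the node estimation loop that PATCH 2 introduces
in DryRunStats.suggest_nodes and PATCH 4 caps at five iterations. The estimate_nodes helper and
every input number below are illustrative assumptions, not part of neurodamus; the real code
takes the per-rank base memory, the cell and synapse totals, and the node memory from the dry
run itself and from psutil.

    import math

    def estimate_nodes(base_memory_mb, ranks_per_node, cell_mb, synapse_mb,
                       node_memory_mb, margin=0.3, max_iter=5):
        """Illustrative re-implementation of the suggest_nodes fixed-point loop."""
        # overhead of a single node: the per-rank base memory is a fixed cost
        full_overhead = base_memory_mb * ranks_per_node
        est_nodes, prev_est_nodes, iter_count = 0, None, 0
        while (prev_est_nodes is None or est_nodes != prev_est_nodes) and iter_count < max_iter:
            prev_est_nodes = est_nodes
            # total memory to place: overhead of all ranks + cells + synapses
            mem_usage = full_overhead + cell_mb + synapse_mb
            mem_usage_with_margin = mem_usage * (1 + margin)
            est_nodes = math.ceil(mem_usage_with_margin / node_memory_mb)
            # the overhead is fixed per rank, so in total it grows with the node count
            full_overhead = base_memory_mb * ranks_per_node * est_nodes
            iter_count += 1
        return est_nodes

    # Made-up inputs: 1.5 GiB base memory per rank, 72 ranks per node, 256 GiB nodes,
    # ~400 GB of cells and ~800 GB of synapses estimated by the dry run.
    print(estimate_nodes(1536, 72, cell_mb=400_000, synapse_mb=800_000,
                         node_memory_mb=262_144))

With these made-up inputs the estimate climbs 7 -> 10 -> 12 -> 13 -> 14 nodes and the loop stops
at the five-iteration cap introduced in PATCH 4 instead of iterating until the estimate
stabilises.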