From b6ea7dd691e0d5f9ef617e6cb396c3fb49b89327 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Thu, 5 Oct 2023 14:49:23 +0200 Subject: [PATCH 1/7] new feature added: removing top mem level when low-level mem size is enough for current layer processing --- zigzag/api.py | 76 +++++ zigzag/classes/stages/RemoveNoUseMemStage.py | 176 ++++++++++++ zigzag/classes/stages/SearchNoUseMemStage.py | 277 +++++++++++++++++++ zigzag/classes/stages/__init__.py | 2 + 4 files changed, 531 insertions(+) create mode 100644 zigzag/classes/stages/RemoveNoUseMemStage.py create mode 100644 zigzag/classes/stages/SearchNoUseMemStage.py diff --git a/zigzag/api.py b/zigzag/api.py index c1b8cb25..344853a2 100644 --- a/zigzag/api.py +++ b/zigzag/api.py @@ -155,6 +155,82 @@ def get_hardware_performance_zigzag_pe_array_scaling( return cmes[0][0].energy_total, cmes[0][0].latency_total2, cmes +def get_hardware_performance_zigzag_unused_mem_removing( + workload, + accelerator, + mapping, + opt="latency", + dump_filename_pattern="outputs/{datetime}.json", + pickle_filename="outputs/list_of_cmes.pickle", +): + # Initialize the logger + import logging as _logging + + _logging_level = _logging.INFO + _logging_format = ( + "%(asctime)s - %(funcName)s +%(lineno)s - %(levelname)s - %(message)s" + ) + _logging.basicConfig(level=_logging_level, format=_logging_format) + + # Sanity check on the optimization criterion + if opt == "energy": + opt_stage = MinimalEnergyStage + elif opt == "latency": + opt_stage = MinimalLatencyStage + elif opt == "EDP": + opt_stage = MinimalEDPStage + else: + raise NotImplementedError( + "Optimization criterion 'opt' should be either 'energy' or 'latency' or 'EDP'." + ) + + # Check workload format and based on it select the correct workload parser stage + try: + if workload.split(".")[-1] == "onnx": + workload_parser_stage = ONNXModelParserStage + else: + workload_parser_stage = WorkloadParserStage + except: + workload_parser_stage = WorkloadParserStage + + mainstage = MainStage( + [ # Initialize the MainStage as entry point + workload_parser_stage, # Parse the ONNX Model into the workload + AcceleratorParserStage, # Parse the accelerator module/passthrough given accelerator + SimpleSaveStage, # Save the summed CME energy and latency to a json + PickleSaveStage, # Save all received CMEs in a list to a pickle file + SumStage, # Sum up the received best CME across all layers of the workload + SearchNoUseMemStage, # Search for unused memory instance + WorkloadStage, # Iterate through the different layers in the workload + RemoveNoUseMemStage, # Remove unused memory instance + CompleteSaveStage, # Save each processed layer to a json + opt_stage, # Reduce all CMEs, returning minimal energy/latency one + SpatialMappingGeneratorStage, # Generate multiple spatial mappings (SM) + opt_stage, # Reduce all CMEs, returning minimal energy/latency one + LomaStage, # Generate multiple temporal mappings (TM) + # TemporalOrderingConversionStage, # Based on the fixed temporal mapping order, generate one temporal mapping (TM) + CostModelStage, # Evaluate generated SM and TM through cost model + ], + accelerator=accelerator, # required by AcceleratorParserStage + workload=workload, # required by workload_parser_stage + mapping=mapping, # required by workload_parser_stage + dump_filename_pattern=dump_filename_pattern, # output file save pattern + pickle_filename=pickle_filename, # filename for pickled list of cmes + loma_lpf_limit=6, # required by LomaStage + loma_show_progress_bar=True, + # If we need access the same input 
data multiple times from the innermost memory level and the data size is smaller than the memory read bw, + # take into account only one-time access cost (assume the data can stay at the output pins of the memory as long as it is needed). + # By default, if the parameter is not defined, it will be set as False internally. + access_same_data_considered_as_no_access=True, + ) + + # Launch the MainStage + answers = mainstage.run() + # Get CME from answer + cmes = answers + + return cmes[0][0].energy_total, cmes[0][0].latency_total2, cmes + if __name__ == "__main__": workload = "zigzag/inputs/examples/workload/mobilenetv2.onnx" # workload = 'inputs.examples.workload.resnet18' diff --git a/zigzag/classes/stages/RemoveNoUseMemStage.py b/zigzag/classes/stages/RemoveNoUseMemStage.py new file mode 100644 index 00000000..da93d969 --- /dev/null +++ b/zigzag/classes/stages/RemoveNoUseMemStage.py @@ -0,0 +1,176 @@ +from zigzag.classes.hardware.architecture.accelerator import Accelerator +from zigzag.classes.hardware.architecture.core import Core +from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy +from zigzag.utils import pickle_deepcopy +from zigzag.classes.stages.Stage import Stage +from typing import Generator + +import logging + +logger = logging.getLogger(__name__) + +#################### Description #################### +## This stage must be processed behind WorkloadStage. +## This stage removes non-useful memory level found by SearchNoUseMemStage. +################### Pseudo-code #################### +## Initialization: +## target_act_mem_level, target_output_mem_level: get from mem_update_list +## target_const_mem_level = mem_udpate_weight +## 1. Modify mem structure: +## for mem in mem_levels(sort_order: from bottom to top): +## if ['I'] in mem.served_operand and mem.mem_level > target_act_mem_level: +## remove ['I'] in mem.served_operand, mem_port_alloc +## if ['O'] in mem.served_operand and mem.mem_level > target_output_mem_level: +## remove ['O'] in mem.served_operand, mem_port_alloc +## if ['W'] in mem.served_operand and mem.mem_level > target_const_mem_level: +## remove ['W'] in mem.served_operand, mem_port_alloc +## 2. 
Remove no-use memory +## for mem in mem_levels(sort_order: from top to bottom): +## if mem.served_operand == empty: +## do not add the current mem into the modified architecture +##################################################### + +class RemoveNoUseMemStage(Stage): + def __init__(self, list_of_callables, *, accelerator, layer, mem_update_list, mem_update_weight, layer_list, **kwargs): + super().__init__(list_of_callables, **kwargs) + self.accelerator = accelerator + self.layer = layer + self.layer_list = layer_list + self.mem_update_list = mem_update_list + self.mem_update_weight = mem_update_weight + + def run(self) -> Generator: + modified_accelerator = self.generate_accelerator_removing_nouse_mem() + sub_stage = self.list_of_callables[0](self.list_of_callables[1:], + accelerator=modified_accelerator, + layer=self.layer, + **self.kwargs,) + for cme, extra_info in sub_stage.run(): + yield cme, extra_info + + def generate_accelerator_removing_nouse_mem(self): + ## Remove nouse memory level according to update_mem_list and mem_update_weight + curr_id = self.layer_list[self.layer] # current layer id (key) in mem_udpate_list + output_operand = self.layer.memory_operand_links[self.layer.output_operand] # output representation + core = next(iter(self.accelerator.cores)) + operational_array = core.operational_array + memory_hierarchy = core.memory_hierarchy + + if len(self.layer.constant_operands) == 1: + act_operand = self.layer.memory_operand_links[[operand for operand in self.layer.input_operands if operand not in self.layer.constant_operands][0]] # act representation + const_operand = self.layer.memory_operand_links[self.layer.constant_operands[0]] # weight representation + elif len(self.layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + # for input operand, we will represent all inputs as one input, since only their data size is used for required mem size calculation. 
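## A minimal sketch of what this "Adder" branch boils down to, with made-up operand
## names (the two real assignments below do exactly this on the layer object):
toy_memory_operand_links = {"X": "I1", "Y": "I2", "O": "O"}  # layer operand -> memory operand
toy_input_operands = ["X", "Y"]  # two activation inputs, no constant operand
toy_act_operand = toy_memory_operand_links[toy_input_operands[0]]  # "I1"
toy_const_operand = toy_memory_operand_links[toy_input_operands[1]]  # "I2", treated as the "weight"
assert (toy_act_operand, toy_const_operand) == ("I1", "I2")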
+ act_operand = self.layer.memory_operand_links[self.layer.input_operands[0]] + const_operand = self.layer.memory_operand_links[self.layer.input_operands[1]] + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(self.layer.pr_loop.keys()) + for operand, related_loop in self.layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [x for x in self.layer.constant_operands if x != act_operand] + assert len(weight_operand) == 1 + weight_operand: str = weight_operand[0] + act_operand = self.layer.memory_operand_links[act_operand] # map from layer representation to hardware representation + const_operand = self.layer.memory_operand_links[weight_operand] # weight representation + + # Find target_act/const/output_mem_level + for pos, ele in enumerate(self.mem_update_list[f"{curr_id}"]): + if list(ele.keys())[0] == f"{act_operand}": + target_act_mem_level = self.mem_update_list[f"{curr_id}"][pos][f"{act_operand}"] + if list(ele.keys())[0] == f"{output_operand}": + target_output_mem_level = self.mem_update_list[f"{curr_id}"][pos][f"{output_operand}"] + if len(self.layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + # Here we make a trick: treating the other input as const_operand + for pos, ele in enumerate(self.mem_update_list[f"{curr_id}"]): + if list(ele.keys())[0] == f"{act_operand}": + target_const_mem_level = self.mem_update_list[f"{curr_id}"][pos][f"{act_operand}"] + else: + target_const_mem_level = self.mem_update_weight + + # Initialize the new memory hierarchy + mh_name = memory_hierarchy.name + new_mh_name = mh_name + "-removing-nouse-mem" + new_memory_hierarchy = MemoryHierarchy(operational_array, new_mh_name) + + # Add memories to the new memory hierarchy with the correct attributes + for curr_mem_level, memory_level in enumerate(memory_hierarchy.mem_level_list): + memory_instance = memory_level.memory_instance + operands = tuple(memory_level.operands) + port_alloc = memory_level.port_alloc_raw + served_dimensions_vec = memory_level.served_dimensions_vec + assert len(served_dimensions_vec) >= 1 + served_dimensions = served_dimensions_vec[0] + + new_memory_instance = pickle_deepcopy(memory_instance) + new_operands = [] + new_port_alloc = [] + if (act_operand in operands) and curr_mem_level <= target_act_mem_level: + new_operands.append(act_operand) + index_in_operands = operands.index(act_operand) + new_port_alloc.append(port_alloc[index_in_operands]) + if (const_operand in operands) and curr_mem_level <= target_const_mem_level: + new_operands.append(const_operand) + index_in_operands = operands.index(const_operand) + new_port_alloc.append(port_alloc[index_in_operands]) + if (output_operand in operands) and curr_mem_level <= target_output_mem_level: + new_operands.append(output_operand) + index_in_operands = operands.index(output_operand) + new_port_alloc.append(port_alloc[index_in_operands]) + new_operands = tuple(new_operands) + new_port_alloc = tuple(new_port_alloc) + new_served_dimensions = pickle_deepcopy(served_dimensions) + if len(new_operands) > 0: + new_memory_hierarchy.add_memory( + memory_instance=new_memory_instance, + operands=new_operands, + port_alloc=new_port_alloc, + served_dimensions=new_served_dimensions, + ) + + # Create the new core + id = core.id + dataflows = core.dataflows + if dataflows is not 
None: + raise NotImplementedError( + "Scale your core-defined dataflows accordingly here." + ) + + new_id = id + new_dataflows = pickle_deepcopy(dataflows) + new_core = Core( + id=new_id, + operational_array=operational_array, + memory_hierarchy=new_memory_hierarchy, + dataflows=new_dataflows, + ) + + # Create the new accelerator + name = self.accelerator.name + new_name = name + "-removing-nouse-mem" + new_cores = {new_core} + new_accelerator = Accelerator( + name=new_name, + core_set=new_cores, + ) + + logger.info(f"Update mem architecture for layer {self.layer}...") + + # RemoveNoUseMemStage.visulize_modified_memory_structure(new_memory_hierarchy) + + return new_accelerator + + @staticmethod + def visulize_modified_memory_structure(new_memory_hierarchy): + # Visualization for debugging + from zigzag.visualization.graph.memory_hierarchy import ( + visualize_memory_hierarchy_graph, + ) + + visualize_memory_hierarchy_graph(new_memory_hierarchy) \ No newline at end of file diff --git a/zigzag/classes/stages/SearchNoUseMemStage.py b/zigzag/classes/stages/SearchNoUseMemStage.py new file mode 100644 index 00000000..fda9f34e --- /dev/null +++ b/zigzag/classes/stages/SearchNoUseMemStage.py @@ -0,0 +1,277 @@ +from zigzag.classes.stages.Stage import Stage + +import networkx as nx +from typing import Generator +from zigzag.classes.workload.dummy_node import DummyNode + +import logging + +logger = logging.getLogger(__name__) + +#################### Description #################### +## This stage must be processed before WorkloadStage. +## This stage figures out the no-use top memory levels for "I", "W", "O" when the size of lower memory level is enough to hold all data, considering the output data of previous layer can be directly used by next layer. As an impact, the energy / latency related to these memories will be removed. +## The general criteria is: +## If a low-level memory size is big enough to hold both "I" and "O" data of current layer, memory above this one will be labeled as no-use. +## If a low-level memory size is big enough to hold "W" data of entire workload, memory above this one will be labeled as no-use. +## The above method only applies layers along the same branch, otherwise (for branch starting nodes or branch final nodes) the "O" data will return back to the top possible memory. +## In RemoveNoUseMemStage, no-use mem across all layers, labeled in this stage, will be removed in the memory architecture. +## For now, the number of cores must be 1. +#################### Pseudo-code #################### +## Initialization: +## mem_update_list = [layer_ids: {"I" / "O": -1}] ## mem level of different operands of each layer (there should be no -1 after self.update_top_mem_level()) +## each_layer_IO_data_size = [layer_ids: {"I" / "O": size}] ## input / output data size of each layer +## mem_update_weight = top_mem_level ## top mem level to put weight +## weight_size_entire_workload = weight_size # weight data size of entire workload +## Generate: +## layer_execution_order = list( topological_sort(layer_gragh) ) +## Locate top mem level for each operand of each layer. Store results in mem_update_list and mem_update_weight. 
+## for layer in all_layers: +## if layer.index != 0: ## not the 1st execution layer +## mem_udpate_list[layer]["I"] = mem_udpate_list[previous_layer]["O"] +## if len(layer.next_node) > 1 or len(next_layer.prevous_node) > 1: ## starting node of branches / final node of branches +## | if layer.index == 0: +## | mem_update_list[layer]["I" / "O"] updates to the top input/output mem level +## | else: +## | mem_update_list[layer]["O"] updates to the top output mem level +## | mem_update_weight = top weight mem level, if mem_update_weight > top weight mem level +## | +## else: +## for mem in mem_levels(sort_order: from top to bottom): +## if sum(layer[operand_size] for operand in mem.operands) <= mem.size: +## if ["I", "O"] both in mem.operands: +## mem_update_list[layer]["O"] = current_mem_level +## if layer.index == 0: ## the 1st execution layer +## mem_update_list[layer]["I"] = current_mem_level +## if ("W" in mem.operand) and (current_mem_level < mem_update_weight): +## mem_update_weight = current_mem_level +##################################################### + +class SearchNoUseMemStage(Stage): + def __init__(self, list_of_callables, *, accelerator, workload, **kwargs): + super().__init__(list_of_callables, **kwargs) + self.accelerator = accelerator + self.workload = workload + ## Initialization + self.mem_update_list = {} + self.each_layer_IO_data_size = {} # unit: bit + core_id = accelerator.cores[0].id # correct only for single-core hardware + self.core_mem_level_list = accelerator.get_core(core_id=core_id).memory_hierarchy.mem_level_list + self.mem_update_weight = len(self.core_mem_level_list)-1 # index of the top memory + self.weight_size_entire_workload = 0 # unit: bit + self.layer_list = {} # layer name and its corresponding id + core = accelerator.get_core(core_id=core_id) + for id, layer in enumerate(nx.topological_sort(workload)): + if type(layer) != DummyNode: # create record on memory level, data size of each operand for un-dummy nodes + # identify the weight operand + if len(layer.constant_operands) == 1: + weight_operand = layer.constant_operands[0] + else: + if len(layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + # for input operand, we will represent all inputs as one input, since only their data size is used for required mem size calculation. 
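## A hedged example of the bookkeeping this constructor builds, for a hypothetical
## two-layer workload on a three-level memory hierarchy (all names and sizes below are
## made up; -1 marks "top level not yet resolved", to be filled in by update_top_mem_level()):
toy_mem_update_list = {"0": [{"O": -1}, {"I1": -1}], "1": [{"O": -1}, {"I1": -1}]}
toy_each_layer_IO_data_size = {"0": [{"O": 802816 * 8, "I1": 401408 * 8}]}  # unit: bit
toy_mem_update_weight = 3 - 1  # index of the top memory level, only ever lowered later
toy_weight_size_entire_workload = 11 * 2**20 * 8  # total weight bits across all layers (made up)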
+ input_operand = layer.input_operands[0] + output_operand = layer.output_operand + input_data_size = 0 + for operand in layer.input_operands: + input_data_size += layer.operand_size_bit[operand] + self.mem_update_list[f"{id}"] = [{operand: -1} for operand in core.mem_hierarchy_dict.keys() if operand in [layer.memory_operand_links[output_operand], layer.memory_operand_links[input_operand]]] + self.each_layer_IO_data_size[f"{id}"] = [{layer.memory_operand_links[output_operand]: layer.operand_size_bit[output_operand], + layer.memory_operand_links[input_operand]: input_data_size + }] + self.layer_list[layer] = id + continue + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(layer.pr_loop.keys()) + for operand, related_loop in layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [x for x in layer.constant_operands if x != act_operand] + assert len(weight_operand) == 1 + weight_operand: str = weight_operand[0] + self.mem_update_list[f"{id}"] = [{operand: -1} for operand in core.mem_hierarchy_dict.keys() if operand != layer.memory_operand_links[weight_operand]] + self.each_layer_IO_data_size[f"{id}"] = [{layer.memory_operand_links[operand]: layer.operand_size_bit[operand] for operand in layer.memory_operand_links.keys() if operand != weight_operand}] + self.weight_size_entire_workload += layer.operand_size_bit[weight_operand] + self.layer_list[layer] = id + + def run(self, workload_data_always_from_top_mem=False) -> Generator: + self.update_top_mem_level() # figure out the lowest possible mem level for all operands for all layers + + if workload_data_always_from_top_mem: + # [OPTIONAL] re-define the input/output mem level of first/last layer to the top possible mem level. This + # is specially designed for the case that workload input and output must be stored in the top mem level. + self.update_mem_level_for_loading_data() + + sub_stage = self.list_of_callables[0](self.list_of_callables[1:], + accelerator=self.accelerator, + workload=self.workload, + mem_update_list=self.mem_update_list, + mem_update_weight=self.mem_update_weight, + layer_list=self.layer_list, + **self.kwargs,) + for cme, (layer, extra_info) in sub_stage.run(): + yield cme, (layer, extra_info) + + def update_top_mem_level(self): + """ + Update mem_update_list and mem_update_weight according to the algorithm description at the file beginning. + """ + """ + param const_operand: constant operand name (e.g. "W") + param act_operand: activation operand name (e.g. "I") + param output_operand: output operand name (e.g. 
"O") + """ + self.remove_dummy_nodes_in_workload() # remove dummy nodes for the ease of telling the branch starting or final nodes + + ## Update mem_update_list and mem_update_weight + for id, layer in enumerate(nx.topological_sort(self.workload)): + branch_starting_node = True if self.workload.out_degree(layer) > 1 else False # starting node of branches + branch_final_node = True if self.workload.out_degree(layer) == 1 and self.workload.in_degree(list(self.workload.successors(layer))[0]) > 1 else False + output_operand = layer.memory_operand_links[layer.output_operand] # output representation + curr_id = self.layer_list[layer] # current layer id (key) in mem_udpate_list + if len(layer.constant_operands) == 1: + const_operand = layer.memory_operand_links[layer.constant_operands[0]] # weight representation + act_operand = layer.memory_operand_links[ + [operand for operand in layer.input_operands if operand not in layer.constant_operands][0]] # act representation + else: + if len(layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + const_operand = None + act_operand = layer.memory_operand_links[layer.input_operands[0]] + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(layer.pr_loop.keys()) + for operand, related_loop in layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [x for x in layer.constant_operands if x != act_operand] + weight_operand: str = weight_operand[0] + act_operand = layer.memory_operand_links[act_operand] # map from layer representation to hardware representation + const_operand = layer.memory_operand_links[weight_operand] # weight representation + if id != 0: ## not the first layer + ## Assign mem_udpate_list[layer]["I"] = mem_udpate_list[previous_layer]["O"] + prev_layer = list(self.workload.predecessors(layer))[0] # previous layer node (object) + prev_layer_id = self.layer_list[prev_layer] # previous layer id + prev_layer_output_operand = prev_layer.output_operand # output representation of previous layer + for ele in self.mem_update_list[f"{prev_layer_id}"]: # find the output mem level of previous layer + try: + prev_layer_output_level = ele[f"{prev_layer_output_operand}"] + except KeyError: # skip if the key is incorrect, as there will only be one that match. + pass + self.update_IO_mem_level(curr_id, act_operand, prev_layer_output_level) # update the input mem level of current layer + if branch_starting_node or branch_final_node: ## branch starting node or branch final node or permited dummy nodes (e.g. 
Adder layer) + ## Update input, weight, output mem level for branch starting node and branch final node + ## Find the top mem level for input if it is the first layer, update mem_udpate_list of current layer + if id==0: ## the first layer + for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): + served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem + if act_operand in served_operands: + self.update_IO_mem_level(curr_id, act_operand, curr_mem_level) # update the input mem level of current layer if it is the first layer + break + ## Find the top mem level for output, update mem_update_list of current layer + for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): + served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem + if output_operand in served_operands: + self.update_IO_mem_level(curr_id, output_operand, curr_mem_level) # update the output mem level of current layer + break + ## Find the top mem level for weight, update mem_update_weight of current layer to the top weight mem level if mem_update_weight is bigger + for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): + served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem + if const_operand in served_operands: # identify the top weight mem level + if curr_mem_level < self.mem_update_weight: # mem_update_weight is bigger than the top weight mem level + self.mem_update_weight = curr_mem_level + break + else: ## node (layer) that is not a branch starting node or a branch final node + ## Iterate the memory level and update input, weight, output mem level + for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): + served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem + ## Update input, weight, output mem level + avail_mem_size = mem.memory_instance.size # available hardware mem size + if len(layer.constant_operands) == 0: # Adder layer: multiple act operands + mem_serve_act = False + for layer_act_operand in layer.input_operands: + if layer.memory_operand_links[layer_act_operand] in served_operands: + mem_serve_act = True + # modify to match the keys used in each_layer_IO_data_size + served_operands = [output_operand, layer.memory_operand_links[layer.input_operands[0]]] + else: + mem_serve_act = True if (act_operand in served_operands) else False + mem_serve_io_both = True if mem_serve_act and (output_operand in served_operands) else False # ["I", "O"] both in mem.served_operands + mem_serve_weight = True if (const_operand in served_operands) else False # mem.served_operands = ["W"] + if mem_serve_io_both or mem_serve_weight: + required_IO_data_size = sum([self.each_layer_IO_data_size[f"{curr_id}"][0][operand] for operand in served_operands if operand != const_operand]) + required_weight_size = self.weight_size_entire_workload if const_operand in served_operands else 0 + required_total_size = required_IO_data_size + required_weight_size # required size to put data in current mem level + if required_total_size <= avail_mem_size: # sum(layer[operand_size] for operand in mem.operands) <= mem.size + if mem_serve_io_both: + if id == 0: + self.update_IO_mem_level(curr_id, act_operand, curr_mem_level) # update input mem level + self.update_IO_mem_level(curr_id, output_operand, curr_mem_level) # update output mem level + if (curr_mem_level < self.mem_update_weight) and 
mem_serve_weight:  # update weight mem level
+                        self.mem_update_weight = curr_mem_level
+        ## [OPTIONAL CHECK] assert that no -1 value is left in mem_update_list
+        ## [NOTE] If there is still a -1 value in mem_update_list at this point, it means the size of the top mem level for I/O is not big enough.
+        for layer_ele in self.mem_update_list.values():
+            for operand_dict in layer_ele:
+                assert list(operand_dict.values())[0] >= 0
+
+    def update_mem_level_for_loading_data(self):
+        """
+        [OPTIONAL FUNCTION] This is an optional function.
+        Depending on your requirements, loading data from and offloading data to the top mem level may be a must.
+        If that is your case, add this function to self.run().
+        Otherwise, if the input is generated on-chip at the lowest possible input mem level and the output is stored on-chip at the lowest possible output mem level, remove this function from self.run().
+        [FUNCTION OBJECT]
+        Update mem_update_list of the first and last layer, so that the input data of the first layer is still loaded from the top input mem level and the output of the last layer is still offloaded to the top output mem level.
+        """
+        self.remove_dummy_nodes_in_workload()  # remove dummy nodes for the ease of telling the branch starting or final nodes
+
+        ## Update mem_update_list and mem_update_weight
+        for id, layer in enumerate(nx.topological_sort(self.workload)):
+            act_operand = layer.memory_operand_links[[operand for operand in layer.input_operands if operand not in layer.constant_operands][0]]  # act representation
+            output_operand = layer.output_operand  # output representation
+            curr_id = self.layer_list[layer]  # current layer id (key) in mem_update_list
+            if id == 0:  # the first layer: update activation mem level to the top possible mem level
+                for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))):
+                    served_operands = list(mem.mem_level_of_operands.keys())  # check the served operands of the current mem
+                    if act_operand in served_operands:
+                        self.update_IO_mem_level(curr_id, act_operand, curr_mem_level)  # update the input mem level of the current layer if it is the first layer
+                        break
+            if id == len(self.layer_list) - 1:  # the last layer: update output mem level to the top possible mem level
+                for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))):
+                    served_operands = list(mem.mem_level_of_operands.keys())  # check the served operands of the current mem
+                    if output_operand in served_operands:
+                        self.update_IO_mem_level(curr_id, output_operand, curr_mem_level)  # update the output mem level of the current layer if it is the last layer
+                        break
+
+    def remove_dummy_nodes_in_workload(self):
+        ## Remove dummy nodes (layers) in the graph (assume there is no branch from a non-dummy node to a dummy node)
+        ## Redirect the outgoing edges of dummy nodes to non-dummy nodes
+        ## Algorithm:
+        ## for each dummy node, add edges between its predecessor nodes and successor nodes; then remove the dummy node.
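## A self-contained sketch of that rewiring on a plain networkx DiGraph, with
## hypothetical node names (the code below does the same on self.workload):
## import networkx as nx
## g = nx.DiGraph([("conv1", "dummy0"), ("dummy0", "conv2"), ("conv2", "conv3")])
## dummy_nodes = [n for n in g.nodes() if n.startswith("dummy")]
## for dummy_node in dummy_nodes:
##     for successor_node in list(g.successors(dummy_node)):
##         for predecessor_node in list(g.predecessors(dummy_node)):
##             g.add_edge(predecessor_node, successor_node)  # bypass the dummy node
## g.remove_nodes_from(dummy_nodes)
## assert sorted(g.edges()) == [("conv1", "conv2"), ("conv2", "conv3")]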
+ ############################################# + ## Comment on the following 4 lines below: visualize the network for debugging + ## import matplotlib.pyplot as plt + ## pos = nx.spring_layout(self.workload) + ## nx.draw(self.workload, pos, with_labels=True, node_color="lightblue", font_weight="bold") + ## plt.show() + ############################################# + dummy_nodes = [node for node in self.workload.nodes() if type(node) == DummyNode] + for dummy_node in dummy_nodes: + for successor_node in list(self.workload.successors(dummy_node)): + for predecessor_node in list(self.workload.predecessors(dummy_node)): + self.workload.add_edge(predecessor_node, successor_node) + self.workload.remove_nodes_from(dummy_nodes) + + def update_IO_mem_level(self, layer_id, operand, target_level): + """ + Update self.mem_update_list as: + self.mem_update_list[layer_id][operand_index][operand] = target_level + """ + for pos, ele in enumerate(self.mem_update_list[f"{layer_id}"]): + if list(ele.keys())[0] == f"{operand}": + self.mem_update_list[f"{layer_id}"][pos][f"{operand}"] = target_level \ No newline at end of file diff --git a/zigzag/classes/stages/__init__.py b/zigzag/classes/stages/__init__.py index efe948b0..fafd26ea 100644 --- a/zigzag/classes/stages/__init__.py +++ b/zigzag/classes/stages/__init__.py @@ -26,6 +26,8 @@ from .Stage import Stage, MainStage from .TemporalOrderingConversionStage import TemporalOrderingConversionStage from .WorkloadStage import WorkloadStage +from .RemoveNoUseMemStage import RemoveNoUseMemStage +from .SearchNoUseMemStage import SearchNoUseMemStage # Parameter providers: these parameters are provided to substages by the following classes: # - accelerator: AcceleratorParserStage From a81cb3f51e09f325ef495c895145fa42478c2899 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Sat, 14 Oct 2023 19:49:21 +0200 Subject: [PATCH 2/7] reform code style to black, add pytest case accordingly, resnet18 has not passed the test --- .test_ascend_like.py.swp | Bin 0 -> 12288 bytes main.py | 2 +- main_onnx.py | 2 +- test_ascend_like.py | 47 ++ .../.test_tpu_like.py.swp | Bin 0 -> 12288 bytes .../test_ascend_like.py | 38 ++ .../test_edge_tpu_like.py | 38 ++ .../test_meta_prototype_like.py | 38 ++ .../test_tesla_npu_like.py | 38 ++ .../test_tpu_like.py | 38 ++ zigzag/api.py | 6 +- .../stages/.RemoveUnusedMemoryStage.py.swp | Bin 0 -> 24576 bytes zigzag/classes/stages/.WorkloadStage.py.swp | Bin 0 -> 12288 bytes ...MemStage.py => RemoveUnusedMemoryStage.py} | 116 +++-- zigzag/classes/stages/SearchNoUseMemStage.py | 277 ----------- .../classes/stages/SearchUnusedMemoryStage.py | 444 ++++++++++++++++++ zigzag/classes/stages/__init__.py | 4 +- 17 files changed, 767 insertions(+), 321 deletions(-) create mode 100644 .test_ascend_like.py.swp create mode 100644 test_ascend_like.py create mode 100644 tests/main/test_without_unused_memory/.test_tpu_like.py.swp create mode 100644 tests/main/test_without_unused_memory/test_ascend_like.py create mode 100644 tests/main/test_without_unused_memory/test_edge_tpu_like.py create mode 100644 tests/main/test_without_unused_memory/test_meta_prototype_like.py create mode 100644 tests/main/test_without_unused_memory/test_tesla_npu_like.py create mode 100644 tests/main/test_without_unused_memory/test_tpu_like.py create mode 100644 zigzag/classes/stages/.RemoveUnusedMemoryStage.py.swp create mode 100644 zigzag/classes/stages/.WorkloadStage.py.swp rename zigzag/classes/stages/{RemoveNoUseMemStage.py => RemoveUnusedMemoryStage.py} (65%) delete mode 100644 
zigzag/classes/stages/SearchNoUseMemStage.py
 create mode 100644 zigzag/classes/stages/SearchUnusedMemoryStage.py

diff --git a/.test_ascend_like.py.swp b/.test_ascend_like.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..89ef1eeb6370d6b7c5424922f4d04663ca877488
GIT binary patch (vim swap file; binary payload omitted)

diff --git a/main.py b/main.py
index 14f13214..8f6dee76 100644
--- a/main.py
+++ b/main.py
@@ -49,4 +49,4 @@
 )
 
 # Launch the MainStage
-mainstage.run()
\ No newline at end of file
+mainstage.run()
diff --git a/main_onnx.py b/main_onnx.py
index 526abd29..de286087 100644
--- a/main_onnx.py
+++ b/main_onnx.py
@@ -45,4 +45,4 @@
 )
 
 # Launch the MainStage
-mainstage.run()
\ No newline at end of file
+mainstage.run()
diff --git a/test_ascend_like.py b/test_ascend_like.py
new file mode 100644
index 00000000..6cd2ddb7
--- /dev/null
+++ b/test_ascend_like.py
@@ -0,0 +1,47 @@
+import pytest
+
+from zigzag.api import get_hardware_performance_zigzag_without_unused_memory
+
+workloads = (
+    #"zigzag/inputs/examples/workload/alexnet.onnx",
+    #"zigzag/inputs/examples/workload/mobilenetv2.onnx",
+    #"zigzag/inputs/examples/workload/resnet18.onnx",
+    "zigzag.inputs.examples.workload.resnet18",
+)
+
+# Expected energy and latency for each workload defined above
+ens_lats = {
+    "zigzag/inputs/examples/workload/alexnet.onnx": (5649555894.9, 8637780),
+    "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6486685),
+    "zigzag/inputs/examples/workload/resnet18.onnx": (1709089377.83, 3583047),
+    "zigzag.inputs.examples.workload.resnet18": (2408671233.7250004, 4804196),
+}
+
+
+workload = workloads[0]
+accelerator = "zigzag.inputs.examples.hardware.Ascend_like"
+mapping = "zigzag.inputs.examples.mapping.ascend_like"
+(energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory(
+        workload, accelerator, mapping
+    )
+
+
+#@pytest.fixture
+#def mapping():
+#    return "zigzag.inputs.examples.mapping.ascend_like"
+#
+#
+#@pytest.fixture
+#def accelerator():
+#    return "zigzag.inputs.examples.hardware.Ascend_like"
+#
+#
+#@pytest.mark.parametrize("workload", workloads)
+#def test_api(workload, accelerator, mapping):
+#    (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory(
+#        workload, accelerator, mapping
+#    )
+#    (expected_energy, expected_latency) = ens_lats[workload]
+#    print(energy, latency)
+#    assert energy == pytest.approx(expected_energy)
+#    assert latency == pytest.approx(expected_latency)
diff --git a/tests/main/test_without_unused_memory/.test_tpu_like.py.swp b/tests/main/test_without_unused_memory/.test_tpu_like.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..6d6f0b6540d5d109039abd523870bb3375ea1dc0
GIT binary patch (vim swap file; binary payload omitted)

diff --git a/tests/main/test_without_unused_memory/test_ascend_like.py b/tests/main/test_without_unused_memory/test_ascend_like.py
new file mode 100644
index 00000000..cedd830e
--- /dev/null
+++ b/tests/main/test_without_unused_memory/test_ascend_like.py
@@ -0,0 +1,38 @@
+import pytest
+
+from zigzag.api import get_hardware_performance_zigzag_without_unused_memory
+
+workloads = (
+    "zigzag/inputs/examples/workload/alexnet.onnx",
+    "zigzag/inputs/examples/workload/mobilenetv2.onnx",
+    "zigzag/inputs/examples/workload/resnet18.onnx",
+    #"zigzag.inputs.examples.workload.resnet18",
+)
+
+# Expected energy and latency for each workload defined above
+ens_lats = {
+    "zigzag/inputs/examples/workload/alexnet.onnx": (5649555894.9, 8637780),
+    "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6486685),
+    "zigzag/inputs/examples/workload/resnet18.onnx": (1709089377.83, 3583047),
+    "zigzag.inputs.examples.workload.resnet18": (2408671233.7250004, 4804196),
+}
+
+
+@pytest.fixture
+def mapping():
+    return "zigzag.inputs.examples.mapping.ascend_like"
+
+
+@pytest.fixture
+def accelerator():
+    return "zigzag.inputs.examples.hardware.Ascend_like"
+
+
+@pytest.mark.parametrize("workload", workloads)
+def test_api(workload, accelerator, mapping):
+    (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory(
+        workload, accelerator, mapping
+    )
+    (expected_energy, expected_latency) = ens_lats[workload]
+    assert energy == pytest.approx(expected_energy)
+    assert latency == pytest.approx(expected_latency)
diff --git a/tests/main/test_without_unused_memory/test_edge_tpu_like.py b/tests/main/test_without_unused_memory/test_edge_tpu_like.py
new file mode 100644
index 00000000..59bdaa34
--- /dev/null
+++ b/tests/main/test_without_unused_memory/test_edge_tpu_like.py
@@ -0,0 +1,38 @@
+import pytest
+
+from zigzag.api import get_hardware_performance_zigzag_without_unused_memory
+
+workloads = (
+    "zigzag/inputs/examples/workload/alexnet.onnx",
+    "zigzag/inputs/examples/workload/mobilenetv2.onnx",
+    "zigzag/inputs/examples/workload/resnet18.onnx",
+    #"zigzag.inputs.examples.workload.resnet18",
+)
+
+# Expected energy and latency for each workload defined above
+ens_lats = {
+    "zigzag/inputs/examples/workload/alexnet.onnx": (5568602396.684999, 8134431),
+    "zigzag/inputs/examples/workload/mobilenetv2.onnx": (751128562.4699999, 2427487),
+    "zigzag/inputs/examples/workload/resnet18.onnx": (1784539639.4349997, 3176546),
+    "zigzag.inputs.examples.workload.resnet18": (2413350265.7900004, 4314851),
+}
+
+
+@pytest.fixture
+def mapping():
+    return
"zigzag.inputs.examples.mapping.edge_tpu_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Edge_TPU_like" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + (expected_energy, expected_latency) = ens_lats[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) diff --git a/tests/main/test_without_unused_memory/test_meta_prototype_like.py b/tests/main/test_without_unused_memory/test_meta_prototype_like.py new file mode 100644 index 00000000..49d7491b --- /dev/null +++ b/tests/main/test_without_unused_memory/test_meta_prototype_like.py @@ -0,0 +1,38 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_without_unused_memory + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + #"zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy and latency for each workload defined above +ens_lats = { + "zigzag/inputs/examples/workload/alexnet.onnx": (5679695605.4400015, 8299150), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (901092009.6000001, 2610609), + "zigzag/inputs/examples/workload/resnet18.onnx": (1730672410.3200004, 3262009), + "zigzag.inputs.examples.workload.resnet18": (2419893343.4549994, 4176163), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.meta_prototype_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Meta_prototype" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + (expected_energy, expected_latency) = ens_lats[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) diff --git a/tests/main/test_without_unused_memory/test_tesla_npu_like.py b/tests/main/test_without_unused_memory/test_tesla_npu_like.py new file mode 100644 index 00000000..37080e6e --- /dev/null +++ b/tests/main/test_without_unused_memory/test_tesla_npu_like.py @@ -0,0 +1,38 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_without_unused_memory + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + #"zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy and latency for each workload defined above +ens_lats = { + "zigzag/inputs/examples/workload/alexnet.onnx": (6040086796.366001, 8389669), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060.6110002, 1965457), + "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3257898), + "zigzag.inputs.examples.workload.resnet18": (2375316568.8910007, 4082454), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.tesla_npu_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Tesla_NPU_like" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + 
(expected_energy, expected_latency) = ens_lats[workload]
+    assert energy == pytest.approx(expected_energy)
+    assert latency == pytest.approx(expected_latency)
diff --git a/tests/main/test_without_unused_memory/test_tpu_like.py b/tests/main/test_without_unused_memory/test_tpu_like.py
new file mode 100644
index 00000000..3386b816
--- /dev/null
+++ b/tests/main/test_without_unused_memory/test_tpu_like.py
@@ -0,0 +1,38 @@
+import pytest
+
+from zigzag.api import get_hardware_performance_zigzag_without_unused_memory
+
+workloads = (
+    "zigzag/inputs/examples/workload/alexnet.onnx",
+    "zigzag/inputs/examples/workload/mobilenetv2.onnx",
+    "zigzag/inputs/examples/workload/resnet18.onnx",
+    #"zigzag.inputs.examples.workload.resnet18",
+)
+
+# Expected energy and latency for each workload defined above
+ens_lats = {
+    "zigzag/inputs/examples/workload/alexnet.onnx": (5475639384.492001, 8979956),
+    "zigzag/inputs/examples/workload/mobilenetv2.onnx": (952688145.0069999, 21873214),
+    "zigzag/inputs/examples/workload/resnet18.onnx": (1659252422.016, 4000289),
+    "zigzag.inputs.examples.workload.resnet18": (2296491401.491, 4909027),
+}
+
+
+@pytest.fixture
+def mapping():
+    return "zigzag.inputs.examples.mapping.tpu_like"
+
+
+@pytest.fixture
+def accelerator():
+    return "zigzag.inputs.examples.hardware.TPU_like"
+
+
+@pytest.mark.parametrize("workload", workloads)
+def test_api(workload, accelerator, mapping):
+    (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory(
+        workload, accelerator, mapping
+    )
+    (expected_energy, expected_latency) = ens_lats[workload]
+    assert energy == pytest.approx(expected_energy)
+    assert latency == pytest.approx(expected_latency)
diff --git a/zigzag/api.py b/zigzag/api.py
index 344853a2..4e0d8f37 100644
--- a/zigzag/api.py
+++ b/zigzag/api.py
@@ -155,7 +155,7 @@ def get_hardware_performance_zigzag_pe_array_scaling(
     return cmes[0][0].energy_total, cmes[0][0].latency_total2, cmes
 
 
-def get_hardware_performance_zigzag_unused_mem_removing(
+def get_hardware_performance_zigzag_without_unused_memory(
     workload,
     accelerator,
     mapping,
@@ -200,9 +200,9 @@ def get_hardware_performance_zigzag_unused_mem_removing(
             SimpleSaveStage,  # Save the summed CME energy and latency to a json
             PickleSaveStage,  # Save all received CMEs in a list to a pickle file
             SumStage,  # Sum up the received best CME across all layers of the workload
-            SearchNoUseMemStage,  # Search for unused memory instance
+            SearchUnusedMemoryStage,  # Search for unused memory instance
             WorkloadStage,  # Iterate through the different layers in the workload
-            RemoveNoUseMemStage,  # Remove unused memory instance
+            RemoveUnusedMemoryStage,  # Remove unused memory instance
             CompleteSaveStage,  # Save each processed layer to a json
             opt_stage,  # Reduce all CMEs, returning minimal energy/latency one
             SpatialMappingGeneratorStage,  # Generate multiple spatial mappings (SM)
diff --git a/zigzag/classes/stages/.RemoveUnusedMemoryStage.py.swp b/zigzag/classes/stages/.RemoveUnusedMemoryStage.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..c41e8a561c8d804559f0ef78a09d8dcb5d689970
GIT binary patch (vim swap file; binary payload omitted)

diff --git a/zigzag/classes/stages/.WorkloadStage.py.swp b/zigzag/classes/stages/.WorkloadStage.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..7cee7c54ec6e1f1a31790cc34a9bc13dfcfe955f
GIT binary patch (vim swap file; binary payload omitted)

diff --git a/zigzag/classes/stages/RemoveNoUseMemStage.py b/zigzag/classes/stages/RemoveUnusedMemoryStage.py
similarity index 65%
rename from zigzag/classes/stages/RemoveNoUseMemStage.py
rename to zigzag/classes/stages/RemoveUnusedMemoryStage.py
index da93d969..c6d73541 100644
--- a/zigzag/classes/stages/RemoveNoUseMemStage.py
+++ b/zigzag/classes/stages/RemoveUnusedMemoryStage.py
@@ -11,7 +11,7 @@
 #################### Description ####################
 ## This stage must be processed behind WorkloadStage.
-## This stage removes non-useful memory level found by SearchNoUseMemStage.
+## This stage removes unused memory level found by SearchUnusedMemoryStage.
 ################### Pseudo-code ####################
 ## Initialization:
 ##   target_act_mem_level, target_output_mem_level: get from mem_update_list
@@ -24,14 +24,25 @@ ## remove ['O'] in mem.served_operand, mem_port_alloc
 ##     if ['W'] in mem.served_operand and mem.mem_level > target_const_mem_level:
 ##       remove ['W'] in mem.served_operand, mem_port_alloc
-## 2. Remove no-use memory
+## 2.
Remove unused memory ## for mem in mem_levels(sort_order: from top to bottom): ## if mem.served_operand == empty: ## do not add the current mem into the modified architecture ##################################################### -class RemoveNoUseMemStage(Stage): - def __init__(self, list_of_callables, *, accelerator, layer, mem_update_list, mem_update_weight, layer_list, **kwargs): + +class RemoveUnusedMemoryStage(Stage): + def __init__( + self, + list_of_callables, + *, + accelerator, + layer, + mem_update_list, + mem_update_weight, + layer_list, + **kwargs, + ): super().__init__(list_of_callables, **kwargs) self.accelerator = accelerator self.layer = layer @@ -40,63 +51,97 @@ def __init__(self, list_of_callables, *, accelerator, layer, mem_update_list, me self.mem_update_weight = mem_update_weight def run(self) -> Generator: - modified_accelerator = self.generate_accelerator_removing_nouse_mem() - sub_stage = self.list_of_callables[0](self.list_of_callables[1:], - accelerator=modified_accelerator, - layer=self.layer, - **self.kwargs,) + print(self.mem_update_list) + print(self.mem_update_weight) + modified_accelerator = self.generate_accelerator_with_removing_unused_memory() + sub_stage = self.list_of_callables[0]( + self.list_of_callables[1:], + accelerator=modified_accelerator, + layer=self.layer, + **self.kwargs, + ) for cme, extra_info in sub_stage.run(): yield cme, extra_info - def generate_accelerator_removing_nouse_mem(self): + def generate_accelerator_with_removing_unused_memory(self): ## Remove nouse memory level according to update_mem_list and mem_update_weight - curr_id = self.layer_list[self.layer] # current layer id (key) in mem_udpate_list - output_operand = self.layer.memory_operand_links[self.layer.output_operand] # output representation + curr_id = self.layer_list[ + self.layer + ] # current layer id (key) in mem_udpate_list + curr_id = str(curr_id) + output_operand = self.layer.memory_operand_links[ + self.layer.output_operand + ] # output representation in memory core = next(iter(self.accelerator.cores)) operational_array = core.operational_array memory_hierarchy = core.memory_hierarchy if len(self.layer.constant_operands) == 1: - act_operand = self.layer.memory_operand_links[[operand for operand in self.layer.input_operands if operand not in self.layer.constant_operands][0]] # act representation - const_operand = self.layer.memory_operand_links[self.layer.constant_operands[0]] # weight representation + act_operand = self.layer.memory_operand_links[ + [ + operand + for operand in self.layer.input_operands + if operand not in self.layer.constant_operands + ][0] + ] # act representation in memory + const_operand = self.layer.memory_operand_links[ + self.layer.constant_operands[0] + ] # weight representation in memory elif len(self.layer.constant_operands) == 0: # special case when defining workload manually: # the constant operands list is empty for such as "Adder" layers # for input operand, we will represent all inputs as one input, since only their data size is used for required mem size calculation. 
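## A hedged sketch of the per-level filtering performed further down in this method:
## an operand is kept at a memory level only if the level index does not exceed that
## operand's target level (all values here are hypothetical):
##     operands   = ("I1", "I2", "O")            # operands served by this memory level
##     port_alloc = (pa_act, pa_weight, pa_out)  # parallel tuple of port allocations
##     targets    = {"I1": 1, "I2": 0, "O": 1}   # from mem_update_list / mem_update_weight
##     curr_mem_level = 1
##     kept = [(op, pa) for op, pa in zip(operands, port_alloc) if curr_mem_level <= targets[op]]
##     # -> keeps ("I1", "O"); "I2" (the weights) is dropped at level 1. A level whose kept
##     #    list ends up empty is simply never added to the new MemoryHierarchy, i.e. removed.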
- act_operand = self.layer.memory_operand_links[self.layer.input_operands[0]] - const_operand = self.layer.memory_operand_links[self.layer.input_operands[1]] + act_operand = self.layer.memory_operand_links[self.layer.input_operands[0]] # act representation in memory + const_operand = self.layer.memory_operand_links[ + self.layer.input_operands[1] + ] # weight representation in memory else: # special case when defining workload manually: # both I and W are considered as constant operands for the first layer pr_loop_keys = tuple(self.layer.pr_loop.keys()) - for operand, related_loop in self.layer.operand_dimensionality_order.items(): + for ( + operand, + related_loop, + ) in self.layer.operand_dimensionality_order.items(): if pr_loop_keys[0] in related_loop: act_operand = operand - weight_operand: list = [x for x in self.layer.constant_operands if x != act_operand] + weight_operand: list = [ + x for x in self.layer.constant_operands if x != act_operand + ] # weight representation in layer assert len(weight_operand) == 1 weight_operand: str = weight_operand[0] - act_operand = self.layer.memory_operand_links[act_operand] # map from layer representation to hardware representation - const_operand = self.layer.memory_operand_links[weight_operand] # weight representation + act_operand = self.layer.memory_operand_links[ + act_operand + ] # map from layer representation to hardware memory representation + const_operand = self.layer.memory_operand_links[ + weight_operand + ] # weight representation in memory # Find target_act/const/output_mem_level - for pos, ele in enumerate(self.mem_update_list[f"{curr_id}"]): - if list(ele.keys())[0] == f"{act_operand}": - target_act_mem_level = self.mem_update_list[f"{curr_id}"][pos][f"{act_operand}"] - if list(ele.keys())[0] == f"{output_operand}": - target_output_mem_level = self.mem_update_list[f"{curr_id}"][pos][f"{output_operand}"] + for pos, ele in enumerate(self.mem_update_list[curr_id]): + if list(ele.keys())[0] == act_operand: + target_act_mem_level = self.mem_update_list[curr_id][pos][ + act_operand + ] + if list(ele.keys())[0] == output_operand: + target_output_mem_level = self.mem_update_list[curr_id][pos][ + output_operand + ] if len(self.layer.constant_operands) == 0: # special case when defining workload manually: # the constant operands list is empty for such as "Adder" layers # Here we make a trick: treating the other input as const_operand - for pos, ele in enumerate(self.mem_update_list[f"{curr_id}"]): - if list(ele.keys())[0] == f"{act_operand}": - target_const_mem_level = self.mem_update_list[f"{curr_id}"][pos][f"{act_operand}"] + for pos, ele in enumerate(self.mem_update_list[curr_id]): + if list(ele.keys())[0] == act_operand: + target_const_mem_level = self.mem_update_list[curr_id][pos][ + act_operand + ] else: target_const_mem_level = self.mem_update_weight # Initialize the new memory hierarchy mh_name = memory_hierarchy.name - new_mh_name = mh_name + "-removing-nouse-mem" + new_mh_name = mh_name + "-without-unused-memory" new_memory_hierarchy = MemoryHierarchy(operational_array, new_mh_name) # Add memories to the new memory hierarchy with the correct attributes @@ -119,7 +164,9 @@ def generate_accelerator_removing_nouse_mem(self): new_operands.append(const_operand) index_in_operands = operands.index(const_operand) new_port_alloc.append(port_alloc[index_in_operands]) - if (output_operand in operands) and curr_mem_level <= target_output_mem_level: + if ( + output_operand in operands + ) and curr_mem_level <= target_output_mem_level: 
new_operands.append(output_operand) index_in_operands = operands.index(output_operand) new_port_alloc.append(port_alloc[index_in_operands]) @@ -137,11 +184,6 @@ def generate_accelerator_removing_nouse_mem(self): # Create the new core id = core.id dataflows = core.dataflows - if dataflows is not None: - raise NotImplementedError( - "Scale your core-defined dataflows accordingly here." - ) - new_id = id new_dataflows = pickle_deepcopy(dataflows) new_core = Core( @@ -162,7 +204,7 @@ def generate_accelerator_removing_nouse_mem(self): logger.info(f"Update mem architecture for layer {self.layer}...") - # RemoveNoUseMemStage.visulize_modified_memory_structure(new_memory_hierarchy) + # RemoveUnusedMemoryStage.visulize_modified_memory_structure(new_memory_hierarchy) return new_accelerator @@ -173,4 +215,4 @@ def visulize_modified_memory_structure(new_memory_hierarchy): visualize_memory_hierarchy_graph, ) - visualize_memory_hierarchy_graph(new_memory_hierarchy) \ No newline at end of file + visualize_memory_hierarchy_graph(new_memory_hierarchy) diff --git a/zigzag/classes/stages/SearchNoUseMemStage.py b/zigzag/classes/stages/SearchNoUseMemStage.py deleted file mode 100644 index fda9f34e..00000000 --- a/zigzag/classes/stages/SearchNoUseMemStage.py +++ /dev/null @@ -1,277 +0,0 @@ -from zigzag.classes.stages.Stage import Stage - -import networkx as nx -from typing import Generator -from zigzag.classes.workload.dummy_node import DummyNode - -import logging - -logger = logging.getLogger(__name__) - -#################### Description #################### -## This stage must be processed before WorkloadStage. -## This stage figures out the no-use top memory levels for "I", "W", "O" when the size of lower memory level is enough to hold all data, considering the output data of previous layer can be directly used by next layer. As an impact, the energy / latency related to these memories will be removed. -## The general criteria is: -## If a low-level memory size is big enough to hold both "I" and "O" data of current layer, memory above this one will be labeled as no-use. -## If a low-level memory size is big enough to hold "W" data of entire workload, memory above this one will be labeled as no-use. -## The above method only applies layers along the same branch, otherwise (for branch starting nodes or branch final nodes) the "O" data will return back to the top possible memory. -## In RemoveNoUseMemStage, no-use mem across all layers, labeled in this stage, will be removed in the memory architecture. -## For now, the number of cores must be 1. -#################### Pseudo-code #################### -## Initialization: -## mem_update_list = [layer_ids: {"I" / "O": -1}] ## mem level of different operands of each layer (there should be no -1 after self.update_top_mem_level()) -## each_layer_IO_data_size = [layer_ids: {"I" / "O": size}] ## input / output data size of each layer -## mem_update_weight = top_mem_level ## top mem level to put weight -## weight_size_entire_workload = weight_size # weight data size of entire workload -## Generate: -## layer_execution_order = list( topological_sort(layer_gragh) ) -## Locate top mem level for each operand of each layer. Store results in mem_update_list and mem_update_weight. 
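
As a concrete illustration of the two bookkeeping structures named in this header, hypothetical contents for a two-layer workload on a three-level hierarchy (level 2 being the top) could look like this:

# Hypothetical contents after the search finishes (layer ids as string keys):
mem_update_list = {
    "0": [{"I1": 1}, {"O": 1}],  # layer 0: act and output both fit at level 1
    "1": [{"I1": 1}, {"O": 2}],  # layer 1: branch output returns to top level 2
}
each_layer_IO_data_size = {
    "0": [{"I1": 32768, "O": 65536}],  # unit: bit
    "1": [{"I1": 65536, "O": 16384}],
}
mem_update_weight = 1  # all weights of the workload fit at level 1
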
-## for layer in all_layers: -## if layer.index != 0: ## not the 1st execution layer -## mem_udpate_list[layer]["I"] = mem_udpate_list[previous_layer]["O"] -## if len(layer.next_node) > 1 or len(next_layer.prevous_node) > 1: ## starting node of branches / final node of branches -## | if layer.index == 0: -## | mem_update_list[layer]["I" / "O"] updates to the top input/output mem level -## | else: -## | mem_update_list[layer]["O"] updates to the top output mem level -## | mem_update_weight = top weight mem level, if mem_update_weight > top weight mem level -## | -## else: -## for mem in mem_levels(sort_order: from top to bottom): -## if sum(layer[operand_size] for operand in mem.operands) <= mem.size: -## if ["I", "O"] both in mem.operands: -## mem_update_list[layer]["O"] = current_mem_level -## if layer.index == 0: ## the 1st execution layer -## mem_update_list[layer]["I"] = current_mem_level -## if ("W" in mem.operand) and (current_mem_level < mem_update_weight): -## mem_update_weight = current_mem_level -##################################################### - -class SearchNoUseMemStage(Stage): - def __init__(self, list_of_callables, *, accelerator, workload, **kwargs): - super().__init__(list_of_callables, **kwargs) - self.accelerator = accelerator - self.workload = workload - ## Initialization - self.mem_update_list = {} - self.each_layer_IO_data_size = {} # unit: bit - core_id = accelerator.cores[0].id # correct only for single-core hardware - self.core_mem_level_list = accelerator.get_core(core_id=core_id).memory_hierarchy.mem_level_list - self.mem_update_weight = len(self.core_mem_level_list)-1 # index of the top memory - self.weight_size_entire_workload = 0 # unit: bit - self.layer_list = {} # layer name and its corresponding id - core = accelerator.get_core(core_id=core_id) - for id, layer in enumerate(nx.topological_sort(workload)): - if type(layer) != DummyNode: # create record on memory level, data size of each operand for un-dummy nodes - # identify the weight operand - if len(layer.constant_operands) == 1: - weight_operand = layer.constant_operands[0] - else: - if len(layer.constant_operands) == 0: - # special case when defining workload manually: - # the constant operands list is empty for such as "Adder" layers - # for input operand, we will represent all inputs as one input, since only their data size is used for required mem size calculation. 
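
The lumping of inputs described in this comment amounts to a one-line sum; a tiny sketch with hypothetical operand sizes:

# Sketch: an Adder layer has no constant operand, so all inputs are lumped
# into one entry whose size is the sum of the individual input sizes.
operand_size_bit = {"X": 8192, "Y": 8192, "O": 8192}  # hypothetical sizes, in bit
input_operands = ["X", "Y"]

input_data_size = sum(operand_size_bit[op] for op in input_operands)
assert input_data_size == 16384  # both inputs counted together
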
- input_operand = layer.input_operands[0] - output_operand = layer.output_operand - input_data_size = 0 - for operand in layer.input_operands: - input_data_size += layer.operand_size_bit[operand] - self.mem_update_list[f"{id}"] = [{operand: -1} for operand in core.mem_hierarchy_dict.keys() if operand in [layer.memory_operand_links[output_operand], layer.memory_operand_links[input_operand]]] - self.each_layer_IO_data_size[f"{id}"] = [{layer.memory_operand_links[output_operand]: layer.operand_size_bit[output_operand], - layer.memory_operand_links[input_operand]: input_data_size - }] - self.layer_list[layer] = id - continue - else: - # special case when defining workload manually: - # both I and W are considered as constant operands for the first layer - pr_loop_keys = tuple(layer.pr_loop.keys()) - for operand, related_loop in layer.operand_dimensionality_order.items(): - if pr_loop_keys[0] in related_loop: - act_operand = operand - weight_operand: list = [x for x in layer.constant_operands if x != act_operand] - assert len(weight_operand) == 1 - weight_operand: str = weight_operand[0] - self.mem_update_list[f"{id}"] = [{operand: -1} for operand in core.mem_hierarchy_dict.keys() if operand != layer.memory_operand_links[weight_operand]] - self.each_layer_IO_data_size[f"{id}"] = [{layer.memory_operand_links[operand]: layer.operand_size_bit[operand] for operand in layer.memory_operand_links.keys() if operand != weight_operand}] - self.weight_size_entire_workload += layer.operand_size_bit[weight_operand] - self.layer_list[layer] = id - - def run(self, workload_data_always_from_top_mem=False) -> Generator: - self.update_top_mem_level() # figure out the lowest possible mem level for all operands for all layers - - if workload_data_always_from_top_mem: - # [OPTIONAL] re-define the input/output mem level of first/last layer to the top possible mem level. This - # is specially designed for the case that workload input and output must be stored in the top mem level. - self.update_mem_level_for_loading_data() - - sub_stage = self.list_of_callables[0](self.list_of_callables[1:], - accelerator=self.accelerator, - workload=self.workload, - mem_update_list=self.mem_update_list, - mem_update_weight=self.mem_update_weight, - layer_list=self.layer_list, - **self.kwargs,) - for cme, (layer, extra_info) in sub_stage.run(): - yield cme, (layer, extra_info) - - def update_top_mem_level(self): - """ - Update mem_update_list and mem_update_weight according to the algorithm description at the file beginning. - """ - """ - param const_operand: constant operand name (e.g. "W") - param act_operand: activation operand name (e.g. "I") - param output_operand: output operand name (e.g. 
"O") - """ - self.remove_dummy_nodes_in_workload() # remove dummy nodes for the ease of telling the branch starting or final nodes - - ## Update mem_update_list and mem_update_weight - for id, layer in enumerate(nx.topological_sort(self.workload)): - branch_starting_node = True if self.workload.out_degree(layer) > 1 else False # starting node of branches - branch_final_node = True if self.workload.out_degree(layer) == 1 and self.workload.in_degree(list(self.workload.successors(layer))[0]) > 1 else False - output_operand = layer.memory_operand_links[layer.output_operand] # output representation - curr_id = self.layer_list[layer] # current layer id (key) in mem_udpate_list - if len(layer.constant_operands) == 1: - const_operand = layer.memory_operand_links[layer.constant_operands[0]] # weight representation - act_operand = layer.memory_operand_links[ - [operand for operand in layer.input_operands if operand not in layer.constant_operands][0]] # act representation - else: - if len(layer.constant_operands) == 0: - # special case when defining workload manually: - # the constant operands list is empty for such as "Adder" layers - const_operand = None - act_operand = layer.memory_operand_links[layer.input_operands[0]] - else: - # special case when defining workload manually: - # both I and W are considered as constant operands for the first layer - pr_loop_keys = tuple(layer.pr_loop.keys()) - for operand, related_loop in layer.operand_dimensionality_order.items(): - if pr_loop_keys[0] in related_loop: - act_operand = operand - weight_operand: list = [x for x in layer.constant_operands if x != act_operand] - weight_operand: str = weight_operand[0] - act_operand = layer.memory_operand_links[act_operand] # map from layer representation to hardware representation - const_operand = layer.memory_operand_links[weight_operand] # weight representation - if id != 0: ## not the first layer - ## Assign mem_udpate_list[layer]["I"] = mem_udpate_list[previous_layer]["O"] - prev_layer = list(self.workload.predecessors(layer))[0] # previous layer node (object) - prev_layer_id = self.layer_list[prev_layer] # previous layer id - prev_layer_output_operand = prev_layer.output_operand # output representation of previous layer - for ele in self.mem_update_list[f"{prev_layer_id}"]: # find the output mem level of previous layer - try: - prev_layer_output_level = ele[f"{prev_layer_output_operand}"] - except KeyError: # skip if the key is incorrect, as there will only be one that match. - pass - self.update_IO_mem_level(curr_id, act_operand, prev_layer_output_level) # update the input mem level of current layer - if branch_starting_node or branch_final_node: ## branch starting node or branch final node or permited dummy nodes (e.g. 
Adder layer) - ## Update input, weight, output mem level for branch starting node and branch final node - ## Find the top mem level for input if it is the first layer, update mem_udpate_list of current layer - if id==0: ## the first layer - for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): - served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem - if act_operand in served_operands: - self.update_IO_mem_level(curr_id, act_operand, curr_mem_level) # update the input mem level of current layer if it is the first layer - break - ## Find the top mem level for output, update mem_update_list of current layer - for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): - served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem - if output_operand in served_operands: - self.update_IO_mem_level(curr_id, output_operand, curr_mem_level) # update the output mem level of current layer - break - ## Find the top mem level for weight, update mem_update_weight of current layer to the top weight mem level if mem_update_weight is bigger - for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): - served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem - if const_operand in served_operands: # identify the top weight mem level - if curr_mem_level < self.mem_update_weight: # mem_update_weight is bigger than the top weight mem level - self.mem_update_weight = curr_mem_level - break - else: ## node (layer) that is not a branch starting node or a branch final node - ## Iterate the memory level and update input, weight, output mem level - for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): - served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem - ## Update input, weight, output mem level - avail_mem_size = mem.memory_instance.size # available hardware mem size - if len(layer.constant_operands) == 0: # Adder layer: multiple act operands - mem_serve_act = False - for layer_act_operand in layer.input_operands: - if layer.memory_operand_links[layer_act_operand] in served_operands: - mem_serve_act = True - # modify to match the keys used in each_layer_IO_data_size - served_operands = [output_operand, layer.memory_operand_links[layer.input_operands[0]]] - else: - mem_serve_act = True if (act_operand in served_operands) else False - mem_serve_io_both = True if mem_serve_act and (output_operand in served_operands) else False # ["I", "O"] both in mem.served_operands - mem_serve_weight = True if (const_operand in served_operands) else False # mem.served_operands = ["W"] - if mem_serve_io_both or mem_serve_weight: - required_IO_data_size = sum([self.each_layer_IO_data_size[f"{curr_id}"][0][operand] for operand in served_operands if operand != const_operand]) - required_weight_size = self.weight_size_entire_workload if const_operand in served_operands else 0 - required_total_size = required_IO_data_size + required_weight_size # required size to put data in current mem level - if required_total_size <= avail_mem_size: # sum(layer[operand_size] for operand in mem.operands) <= mem.size - if mem_serve_io_both: - if id == 0: - self.update_IO_mem_level(curr_id, act_operand, curr_mem_level) # update input mem level - self.update_IO_mem_level(curr_id, output_operand, curr_mem_level) # update output mem level - if (curr_mem_level < self.mem_update_weight) and 
mem_serve_weight: # update weight mem level - self.mem_update_weight = curr_mem_level - ## [OPTIONAL CHECK] assert check if there is -1 value in mem_update_list - ## [NOTE] Until here, if there is still -1 value in mem_update_list, it means the size of top mem level for IO is not big enough. - for layer_ele in self.mem_update_list.values(): - for operand_dict in layer_ele: - assert list(operand_dict.values())[0] >= 0 - - def update_mem_level_for_loading_data(self): - """ - [OPTIONAL FUNCTION] This is an optional function. - Depending on your requirement, sometimes data loading from the top mem level and offloading to the top mem level is a must. - If that is the your case, add this function to self.run(). - Otherwise, if the input is generated on-chip at the lowest possible input mem level and the output is stored on-chip at the lowest possible output mem level, remove this function from self.run(). - [FUNCTION OBJECT] - Update mem_update_list of first and last layer, so that the input data of first layer still is loaded from top input mem level and the output of last layer still is offloaded to top output mem level - """ - self.remove_dummy_nodes_in_workload() # remove dummy nodes for the ease of telling the branch starting or final nodes - - ## Update mem_update_list and mem_update_weight - for id, layer in enumerate(nx.topological_sort(self.workload)): - act_operand = layer.memory_operand_links[[operand for operand in layer.input_operands if operand not in layer.constant_operands][0]] # act representation - output_operand = layer.output_operand # output representation - curr_id = self.layer_list[layer] # current layer id (key) in mem_udpate_list - if id == 0: # the first layer: update activation mem level to the top possible mem level - for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): - served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem - if act_operand in served_operands: - self.update_IO_mem_level(curr_id, act_operand, curr_mem_level) # update the input mem level of current layer if it is the first layer - break - if id == len(self.layer_list) - 1: # the last layer: update output mem level to the top possible mem level - for curr_mem_level, mem in reversed(list(enumerate(self.core_mem_level_list))): - served_operands = list(mem.mem_level_of_operands.keys()) # Check the served operand of current mem - if output_operand in served_operands: - self.update_IO_mem_level(curr_id, output_operand, curr_mem_level) # update the output mem level of current layer if it is the last layer - break - def remove_dummy_nodes_in_workload(self): - ## Remove dummy nodes (layers) in the graph (assume there is no branch from a non-dummy node to dummy node) - ## Redirect the outgoing edges of dummy nodes to non-dummy nodes - ## Algorithm: - ## for each dummy node, add edges between its predecessor nodes and successor nodes; then remove the dummy node. 
-    #############################################
-    ## Comment on the following 4 lines below: visualize the network for debugging
-    ## import matplotlib.pyplot as plt
-    ## pos = nx.spring_layout(self.workload)
-    ## nx.draw(self.workload, pos, with_labels=True, node_color="lightblue", font_weight="bold")
-    ## plt.show()
-    #############################################
-        dummy_nodes = [node for node in self.workload.nodes() if type(node) == DummyNode]
-        for dummy_node in dummy_nodes:
-            for successor_node in list(self.workload.successors(dummy_node)):
-                for predecessor_node in list(self.workload.predecessors(dummy_node)):
-                    self.workload.add_edge(predecessor_node, successor_node)
-        self.workload.remove_nodes_from(dummy_nodes)
-
-    def update_IO_mem_level(self, layer_id, operand, target_level):
-        """
-        Update self.mem_update_list as:
-        self.mem_update_list[layer_id][operand_index][operand] = target_level
-        """
-        for pos, ele in enumerate(self.mem_update_list[f"{layer_id}"]):
-            if list(ele.keys())[0] == f"{operand}":
-                self.mem_update_list[f"{layer_id}"][pos][f"{operand}"] = target_level
\ No newline at end of file
diff --git a/zigzag/classes/stages/SearchUnusedMemoryStage.py b/zigzag/classes/stages/SearchUnusedMemoryStage.py
new file mode 100644
index 00000000..1be0f79d
--- /dev/null
+++ b/zigzag/classes/stages/SearchUnusedMemoryStage.py
@@ -0,0 +1,444 @@
+from zigzag.classes.stages.Stage import Stage
+
+import networkx as nx
+from typing import Generator
+from zigzag.classes.workload.dummy_node import DummyNode
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+#################### Description ####################
+## This stage must be processed before WorkloadStage.
+## This stage figures out the unused memory levels for "I", "W", "O" when the size of a lower memory level is enough to hold all data, considering that the output data of the previous layer can be directly used by the next layer. As a result, the energy / latency related to these memories will be removed.
+## The general criterion is:
+##   If a low-level memory size is big enough to hold both the "I" and "O" data of the current layer, the memory above this one will be labeled as unused.
+##   If a low-level memory size is big enough to hold the "W" data of the entire workload, the memory above this one will be labeled as unused.
+## The above method only applies to layers along the same branch; otherwise (for branch starting nodes or branch final nodes) the "O" data will be returned to the top possible memory.
+## In RemoveUnusedMemoryStage, the unused memories across all layers, labeled in this stage, will be removed from the memory architecture.
+## For now, the number of cores must be 1.
+#################### Pseudo-code ####################
+## Initialization:
+##   mem_update_list = [layer_ids: {"I" / "O": -1}] ## mem level of different operands of each layer (there should be no -1 after self.update_top_mem_level())
+##   each_layer_IO_data_size = [layer_ids: {"I" / "O": size}] ## input / output data size of each layer
+##   mem_update_weight = top_mem_level ## top mem level to put weight
+##   weight_size_entire_workload = weight_size # weight data size of entire workload
+## Generate:
+##   layer_execution_order = list( topological_sort(layer_graph) )
+## Locate the top mem level for each operand of each layer. Store results in mem_update_list and mem_update_weight.
+## for layer in all_layers:
+##   if layer.index != 0: ## not the 1st execution layer
+##     mem_update_list[layer]["I"] = mem_update_list[previous_layer]["O"]
+##   if len(layer.next_node) > 1 or len(next_layer.previous_node) > 1: ## starting node of branches / final node of branches
+##     | if layer.index == 0:
+##     |   mem_update_list[layer]["I" / "O"] updates to the top input/output mem level
+##     | else:
+##     |   mem_update_list[layer]["O"] updates to the top output mem level
+##     |   mem_update_weight = top weight mem level, if mem_update_weight > top weight mem level
+##     |
+##   else:
+##     for mem in mem_levels(sort_order: from top to bottom):
+##       if sum(layer[operand_size] for operand in mem.operands) <= mem.size:
+##         if ["I", "O"] both in mem.operands:
+##           mem_update_list[layer]["O"] = current_mem_level
+##           if layer.index == 0: ## the 1st execution layer
+##             mem_update_list[layer]["I"] = current_mem_level
+##         if ("W" in mem.operand) and (current_mem_level < mem_update_weight):
+##           mem_update_weight = current_mem_level
+#####################################################
+
+
+class SearchUnusedMemoryStage(Stage):
+    def __init__(self, list_of_callables, *, accelerator, workload, **kwargs):
+        super().__init__(list_of_callables, **kwargs)
+        self.accelerator = accelerator
+        self.workload = workload
+        ## Initialization
+        self.mem_update_list = {}
+        self.each_layer_IO_data_size = {}  # unit: bit
+        core_id = accelerator.cores[0].id  # correct only for single-core hardware
+        self.core_mem_level_list = accelerator.get_core(
+            core_id=core_id
+        ).memory_hierarchy.mem_level_list
+        self.mem_update_weight = (
+            len(self.core_mem_level_list) - 1
+        )  # index of the top memory
+        self.weight_size_entire_workload = 0  # unit: bit
+        self.layer_list = {}  # layer name and its corresponding id
+        core = accelerator.get_core(core_id=core_id)
+        for id, layer in enumerate(nx.topological_sort(workload)):
+            if (
+                type(layer) != DummyNode
+            ):  # create a record of the memory level and data size of each operand for non-dummy nodes
+                # identify the weight operand
+                if len(layer.constant_operands) == 1:
+                    weight_operand = layer.constant_operands[0]
+                else:
+                    if len(layer.constant_operands) == 0:
+                        # special case when defining workload manually:
+                        # the constant operands list is empty for layers such as "Adder" layers
+                        # for the input operand, we represent all inputs as one input, since only their data size is used for the required mem size calculation.
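
The act/weight disambiguation used in the special case above can be sketched in isolation as follows (the dictionary contents are hypothetical):

# For a first layer where both inputs are constant, the activation is the
# operand whose dimensions contain a partially-relevant (pr) loop, e.g. IX/IY.
pr_loop = {"IX": 3, "IY": 3}                      # hypothetical pr loops
operand_dimensionality_order = {"I": ("B", "IX", "IY"), "W": ("K", "C")}
constant_operands = ["I", "W"]

pr_key = tuple(pr_loop.keys())[0]                 # "IX"
act = next(op for op, dims in operand_dimensionality_order.items() if pr_key in dims)
weight = next(op for op in constant_operands if op != act)
assert (act, weight) == ("I", "W")
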
+ input_operand = layer.input_operands[0] + output_operand = layer.output_operand + input_data_size = 0 + for operand in layer.input_operands: + input_data_size += layer.operand_size_bit[operand] + self.mem_update_list[f"{id}"] = [ + {operand: -1} + for operand in core.mem_hierarchy_dict.keys() + if operand + in [ + layer.memory_operand_links[output_operand], + layer.memory_operand_links[input_operand], + ] + ] + self.each_layer_IO_data_size[f"{id}"] = [ + { + layer.memory_operand_links[ + output_operand + ]: layer.operand_size_bit[output_operand], + layer.memory_operand_links[ + input_operand + ]: input_data_size, + } + ] + self.layer_list[layer] = id + continue + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(layer.pr_loop.keys()) + for ( + operand, + related_loop, + ) in layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [ + x for x in layer.constant_operands if x != act_operand + ] + assert len(weight_operand) == 1 + weight_operand: str = weight_operand[0] + self.mem_update_list[f"{id}"] = [ + {operand: -1} + for operand in core.mem_hierarchy_dict.keys() + if operand != layer.memory_operand_links[weight_operand] + ] + self.each_layer_IO_data_size[f"{id}"] = [ + { + layer.memory_operand_links[operand]: layer.operand_size_bit[ + operand + ] + for operand in layer.memory_operand_links.keys() + if operand != weight_operand + } + ] + self.weight_size_entire_workload += layer.operand_size_bit[ + weight_operand + ] + self.layer_list[layer] = id + + def run(self, workload_data_always_from_top_mem=False) -> Generator: + self.update_top_mem_level() # figure out the lowest possible mem level for all operands for all layers + + if workload_data_always_from_top_mem: + # [OPTIONAL] re-define the input/output mem level of first/last layer to the top possible mem level. This + # is specially designed for the case that workload input and output must be stored in the top mem level. + self.update_mem_level_for_loading_data() + + sub_stage = self.list_of_callables[0]( + self.list_of_callables[1:], + accelerator=self.accelerator, + workload=self.workload, + mem_update_list=self.mem_update_list, + mem_update_weight=self.mem_update_weight, + layer_list=self.layer_list, + **self.kwargs, + ) + for cme, (layer, extra_info) in sub_stage.run(): + yield cme, (layer, extra_info) + + def update_top_mem_level(self): + """ + Update mem_update_list and mem_update_weight according to the algorithm description at the file beginning. 
+ """ + self.remove_dummy_nodes_in_workload() # remove dummy nodes for the ease of telling the branch starting or final nodes + + ## Update mem_update_list and mem_update_weight + for id, layer in enumerate(nx.topological_sort(self.workload)): + branch_starting_node = ( + True if self.workload.out_degree(layer) > 1 else False + ) # starting node of branches + branch_final_node = ( + True + if self.workload.out_degree(layer) == 1 + and self.workload.in_degree(list(self.workload.successors(layer))[0]) + > 1 + else False + ) + output_operand = layer.memory_operand_links[ + layer.output_operand + ] # output representation in memory + curr_id = self.layer_list[ + layer + ] # current layer id (key) in mem_udpate_list + if len(layer.constant_operands) == 1: + const_operand = layer.memory_operand_links[ + layer.constant_operands[0] + ] # weight representation in memory + act_operand = layer.memory_operand_links[ + [ + operand + for operand in layer.input_operands + if operand not in layer.constant_operands + ][0] + ] # act representation in memory + else: + if len(layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + const_operand = None + act_operand = layer.memory_operand_links[layer.input_operands[0]] + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(layer.pr_loop.keys()) + for ( + operand, + related_loop, + ) in layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [ + x for x in layer.constant_operands if x != act_operand + ] + weight_operand: str = weight_operand[0] + act_operand = layer.memory_operand_links[ + act_operand + ] # map from layer representation to hardware memory representation + const_operand = layer.memory_operand_links[ + weight_operand + ] # weight representation in memory + if id != 0: ## not the first layer + ## Assign mem_udpate_list[layer]["I"] = mem_udpate_list[previous_layer]["O"] + prev_layer = list(self.workload.predecessors(layer))[ + 0 + ] # previous layer node (object) + prev_layer_id = self.layer_list[prev_layer] # previous layer id + prev_layer_output_operand = ( + prev_layer.output_operand + ) # output representation in memory of previous layer + for ele in self.mem_update_list[ + f"{prev_layer_id}" + ]: # find the output mem level of previous layer + try: + prev_layer_output_level = ele[f"{prev_layer_output_operand}"] + except ( + KeyError + ): # skip if the key is incorrect, as there will only be one that match. + pass + self.update_IO_mem_level( + curr_id, act_operand, prev_layer_output_level + ) # update the input mem level of current layer + if ( + branch_starting_node or branch_final_node + ): ## branch starting node or branch final node or permited dummy nodes (e.g. 
Adder layer) + ## Update input, weight, output mem level for branch starting node and branch final node + ## Find the top mem level for input if it is the first layer, update mem_udpate_list of current layer + if id == 0: ## the first layer + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if act_operand in served_operands: + self.update_IO_mem_level( + curr_id, act_operand, curr_mem_level + ) # update the input mem level of current layer if it is the first layer + break + ## Find the top mem level for output, update mem_update_list of current layer + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if output_operand in served_operands: + self.update_IO_mem_level( + curr_id, output_operand, curr_mem_level + ) # update the output mem level of current layer + break + ## Find the top mem level for weight, update mem_update_weight of current layer to the top weight mem level if mem_update_weight is bigger + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if ( + const_operand in served_operands + ): # identify the top weight mem level + if ( + curr_mem_level < self.mem_update_weight + ): # mem_update_weight is bigger than the top weight mem level + self.mem_update_weight = curr_mem_level + break + else: ## node (layer) that is not a branch starting node or a branch final node + ## Iterate the memory level and update input, weight, output mem level + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + ## Update input, weight, output mem level + avail_mem_size = ( + mem.memory_instance.size + ) # available hardware mem size + if ( + len(layer.constant_operands) == 0 + ): # Adder layer: multiple act operands + mem_serve_act = False + for layer_act_operand in layer.input_operands: + if ( + layer.memory_operand_links[layer_act_operand] + in served_operands + ): + mem_serve_act = True + # modify to match the keys used in each_layer_IO_data_size + served_operands = [ + output_operand, + layer.memory_operand_links[layer.input_operands[0]], + ] + else: + mem_serve_act = ( + True if (act_operand in served_operands) else False + ) + mem_serve_io_both = ( + True + if mem_serve_act and (output_operand in served_operands) + else False + ) # ["I", "O"] both in mem.served_operands + mem_serve_weight = ( + True if (const_operand in served_operands) else False + ) # mem.served_operands = ["W"] + if mem_serve_io_both or mem_serve_weight: + required_IO_data_size = sum( + [ + self.each_layer_IO_data_size[f"{curr_id}"][0][operand] + for operand in served_operands + if operand != const_operand + ] + ) + required_weight_size = ( + self.weight_size_entire_workload + if const_operand in served_operands + else 0 + ) + required_total_size = ( + required_IO_data_size + required_weight_size + ) # required size to put data in current mem level + if ( + required_total_size <= avail_mem_size + ): # sum(layer[operand_size] for operand in mem.operands) <= mem.size + if mem_serve_io_both: + if id == 0: + self.update_IO_mem_level( + curr_id, act_operand, 
curr_mem_level
+                                    )  # update input mem level
+                                self.update_IO_mem_level(
+                                    curr_id, output_operand, curr_mem_level
+                                )  # update output mem level
+                            if (
+                                curr_mem_level < self.mem_update_weight
+                            ) and mem_serve_weight:  # update weight mem level
+                                self.mem_update_weight = curr_mem_level
+        ## [OPTIONAL CHECK] assert check if there is -1 value in mem_update_list
+        ## [NOTE] Until here, if there is still -1 value in mem_update_list, it means the size of top mem level for IO is not big enough.
+        for layer_ele in self.mem_update_list.values():
+            for operand_dict in layer_ele:
+                assert list(operand_dict.values())[0] >= 0
+
+    def update_mem_level_for_loading_data(self):
+        """
+        [OPTIONAL FUNCTION] This is an optional function.
+        Depending on your requirement, sometimes data loading from the top mem level and offloading to the top mem level is a must.
+        If that is your case, add this function to self.run().
+        Otherwise, if the input is generated on-chip at the lowest possible input mem level and the output is stored on-chip at the lowest possible output mem level, remove this function from self.run().
+        [FUNCTION OBJECT]
+        Update mem_update_list of the first and last layer, so that the input data of the first layer is still loaded from the top input mem level and the output of the last layer is still offloaded to the top output mem level.
+        """
+        self.remove_dummy_nodes_in_workload()  # remove dummy nodes for the ease of telling the branch starting or final nodes
+
+        ## Update mem_update_list and mem_update_weight
+        for id, layer in enumerate(nx.topological_sort(self.workload)):
+            act_operand = layer.memory_operand_links[
+                [
+                    operand
+                    for operand in layer.input_operands
+                    if operand not in layer.constant_operands
+                ][0]
+            ]  # act representation
+            output_operand = layer.output_operand  # output representation
+            curr_id = self.layer_list[
+                layer
+            ]  # current layer id (key) in mem_update_list
+            if (
+                id == 0
+            ):  # the first layer: update activation mem level to the top possible mem level
+                for curr_mem_level, mem in reversed(
+                    list(enumerate(self.core_mem_level_list))
+                ):
+                    served_operands = list(
+                        mem.mem_level_of_operands.keys()
+                    )  # Check the served operand of current mem
+                    if act_operand in served_operands:
+                        self.update_IO_mem_level(
+                            curr_id, act_operand, curr_mem_level
+                        )  # update the input mem level of current layer if it is the first layer
+                        break
+            if (
+                id == len(self.layer_list) - 1
+            ):  # the last layer: update output mem level to the top possible mem level
+                for curr_mem_level, mem in reversed(
+                    list(enumerate(self.core_mem_level_list))
+                ):
+                    served_operands = list(
+                        mem.mem_level_of_operands.keys()
+                    )  # Check the served operand of current mem
+                    if output_operand in served_operands:
+                        self.update_IO_mem_level(
+                            curr_id, output_operand, curr_mem_level
+                        )  # update the output mem level of current layer if it is the last layer
+                        break
+
+    def remove_dummy_nodes_in_workload(self):
+        ## Remove dummy nodes (layers) in the graph (assume there is no branch from a non-dummy node to a dummy node)
+        ## Redirect the outgoing edges of dummy nodes to non-dummy nodes
+        ## Algorithm:
+        ## for each dummy node, add edges between its predecessor nodes and successor nodes; then remove the dummy node.
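
Both graph manipulations this class relies on, bypassing dummy nodes and classifying branch nodes by degree, can be sketched self-contained on a toy networkx graph (node names hypothetical):

import networkx as nx

g = nx.DiGraph()
g.add_edges_from(
    [("conv1", "dummy"), ("dummy", "conv2"), ("conv2", "add"), ("conv1", "add")]
)

# 1) Bypass dummy nodes: connect every predecessor to every successor, then drop them.
dummy_nodes = [n for n in g.nodes() if n == "dummy"]  # stand-in for isinstance(n, DummyNode)
for d in dummy_nodes:
    for s in list(g.successors(d)):
        for p in list(g.predecessors(d)):
            g.add_edge(p, s)
g.remove_nodes_from(dummy_nodes)

# 2) Classify branch nodes by degree, as update_top_mem_level() does.
assert g.out_degree("conv1") > 1  # branch starting node
succ = list(g.successors("conv2"))[0]
assert g.out_degree("conv2") == 1 and g.in_degree(succ) > 1  # branch final node
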
+ ############################################# + ## Comment on the following 4 lines below: visualize the network for debugging + ## import matplotlib.pyplot as plt + ## pos = nx.spring_layout(self.workload) + ## nx.draw(self.workload, pos, with_labels=True, node_color="lightblue", font_weight="bold") + ## plt.show() + ############################################# + dummy_nodes = [ + node for node in self.workload.nodes() if type(node) == DummyNode + ] + for dummy_node in dummy_nodes: + for successor_node in list(self.workload.successors(dummy_node)): + for predecessor_node in list(self.workload.predecessors(dummy_node)): + self.workload.add_edge(predecessor_node, successor_node) + self.workload.remove_nodes_from(dummy_nodes) + + def update_IO_mem_level(self, layer_id, operand, target_level): + """ + Update self.mem_update_list as: + self.mem_update_list[layer_id][operand_index][operand] = target_level + """ + for pos, ele in enumerate(self.mem_update_list[f"{layer_id}"]): + if list(ele.keys())[0] == f"{operand}": + self.mem_update_list[f"{layer_id}"][pos][f"{operand}"] = target_level diff --git a/zigzag/classes/stages/__init__.py b/zigzag/classes/stages/__init__.py index fafd26ea..fdf696a7 100644 --- a/zigzag/classes/stages/__init__.py +++ b/zigzag/classes/stages/__init__.py @@ -26,8 +26,8 @@ from .Stage import Stage, MainStage from .TemporalOrderingConversionStage import TemporalOrderingConversionStage from .WorkloadStage import WorkloadStage -from .RemoveNoUseMemStage import RemoveNoUseMemStage -from .SearchNoUseMemStage import SearchNoUseMemStage +from .RemoveUnusedMemoryStage import RemoveUnusedMemoryStage +from .SearchUnusedMemoryStage import SearchUnusedMemoryStage # Parameter providers: these parameters are provided to substages by the following classes: # - accelerator: AcceleratorParserStage From ed76389183af7bf3b76dd79df4b59bcbcf6b9795 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Sat, 14 Oct 2023 19:50:36 +0200 Subject: [PATCH 3/7] remove swp file generated by vim --- ...cend_like.py.swp => .nfs000000000c8a776f0000304c | Bin ...tpu_like.py.swp => .nfs000000000ceb00d10000304b} | Bin ...oadStage.py.swp => .nfs000000000ceb00de0000304a} | Bin ...oryStage.py.swp => .nfs000000000ceb00e100003049} | Bin 4 files changed, 0 insertions(+), 0 deletions(-) rename .test_ascend_like.py.swp => .nfs000000000c8a776f0000304c (100%) rename tests/main/test_without_unused_memory/{.test_tpu_like.py.swp => .nfs000000000ceb00d10000304b} (100%) rename zigzag/classes/stages/{.WorkloadStage.py.swp => .nfs000000000ceb00de0000304a} (100%) rename zigzag/classes/stages/{.RemoveUnusedMemoryStage.py.swp => .nfs000000000ceb00e100003049} (100%) diff --git a/.test_ascend_like.py.swp b/.nfs000000000c8a776f0000304c similarity index 100% rename from .test_ascend_like.py.swp rename to .nfs000000000c8a776f0000304c diff --git a/tests/main/test_without_unused_memory/.test_tpu_like.py.swp b/tests/main/test_without_unused_memory/.nfs000000000ceb00d10000304b similarity index 100% rename from tests/main/test_without_unused_memory/.test_tpu_like.py.swp rename to tests/main/test_without_unused_memory/.nfs000000000ceb00d10000304b diff --git a/zigzag/classes/stages/.WorkloadStage.py.swp b/zigzag/classes/stages/.nfs000000000ceb00de0000304a similarity index 100% rename from zigzag/classes/stages/.WorkloadStage.py.swp rename to zigzag/classes/stages/.nfs000000000ceb00de0000304a diff --git a/zigzag/classes/stages/.RemoveUnusedMemoryStage.py.swp b/zigzag/classes/stages/.nfs000000000ceb00e100003049 similarity index 100% rename from 
zigzag/classes/stages/.RemoveUnusedMemoryStage.py.swp
rename to zigzag/classes/stages/.nfs000000000ceb00e100003049

From 95c81e20f5bdfbfb6794a63c44d22b796ebdd514 Mon Sep 17 00:00:00 2001
From: JiacongSun
Date: Fri, 20 Oct 2023 16:50:56 +0200
Subject: [PATCH 4/7] fix the SearchUnusedMemoryStage to better support Adder
 layers; add related pytest under tests/main

---
 .nfs000000000c8a776f0000304c                  | Bin 12288 -> 0 bytes
 test_ascend_like.py                           |  47 ----------
 .../test_ascend_like.py                       |   0
 .../test_edge_tpu_like.py                     |   0
 .../test_meta_prototype_like.py               |   0
 .../test_tesla_npu_like.py                    |   0
 .../test_tpu_like.py                          |   0
 .../.nfs000000000ceb00d10000304b              | Bin 12288 -> 0 bytes
 .../test_ascend_like.py                       |   4 +-
 .../test_edge_tpu_like.py                     |   4 +-
 .../test_meta_prototype_like.py               |   4 +-
 .../test_tesla_npu_like.py                    |   4 +-
 .../test_tpu_like.py                          |   4 +-
 .../stages/.nfs000000000ceb00de0000304a       | Bin 12288 -> 0 bytes
 .../stages/.nfs000000000ceb00e100003049      | Bin 24576 -> 0 bytes
 .../classes/stages/RemoveUnusedMemoryStage.py |   2 -
 .../classes/stages/SearchUnusedMemoryStage.py |  84 ++++++++++++++----
 17 files changed, 74 insertions(+), 79 deletions(-)
 delete mode 100644 .nfs000000000c8a776f0000304c
 delete mode 100644 test_ascend_like.py
 rename tests/main/{ => test_with_unused_memory}/test_ascend_like.py (100%)
 rename tests/main/{ => test_with_unused_memory}/test_edge_tpu_like.py (100%)
 rename tests/main/{ => test_with_unused_memory}/test_meta_prototype_like.py (100%)
 rename tests/main/{ => test_with_unused_memory}/test_tesla_npu_like.py (100%)
 rename tests/main/{ => test_with_unused_memory}/test_tpu_like.py (100%)
 delete mode 100644 tests/main/test_without_unused_memory/.nfs000000000ceb00d10000304b
 delete mode 100644 zigzag/classes/stages/.nfs000000000ceb00de0000304a
 delete mode 100644 zigzag/classes/stages/.nfs000000000ceb00e100003049

diff --git a/.nfs000000000c8a776f0000304c b/.nfs000000000c8a776f0000304c
deleted file mode 100644
index 89ef1eeb6370d6b7c5424922f4d04663ca877488..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
[binary payload of deleted vim swap file omitted]

diff --git a/test_ascend_like.py b/test_ascend_like.py
deleted file mode 100644
index 6cd2ddb7..00000000
--- a/test_ascend_like.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import pytest
-
-from zigzag.api import get_hardware_performance_zigzag_without_unused_memory
-
-workloads = (
-    #"zigzag/inputs/examples/workload/alexnet.onnx",
-    #"zigzag/inputs/examples/workload/mobilenetv2.onnx",
-    #"zigzag/inputs/examples/workload/resnet18.onnx",
-    "zigzag.inputs.examples.workload.resnet18",
-)
-
-# Expected energy and latency for each workload defined above
-ens_lats = {
-    "zigzag/inputs/examples/workload/alexnet.onnx": (5649555894.9, 8637780),
-    "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6486685),
-    "zigzag/inputs/examples/workload/resnet18.onnx": (1709089377.83, 3583047),
-    "zigzag.inputs.examples.workload.resnet18": (2408671233.7250004, 4804196),
-}
-
-
-workload = workloads[0]
-accelerator = "zigzag.inputs.examples.hardware.Ascend_like"
-mapping = "zigzag.inputs.examples.mapping.ascend_like"
-(energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory(
-    workload, accelerator, mapping
-    )
-
-
-#@pytest.fixture
-#def mapping():
-#    return "zigzag.inputs.examples.mapping.ascend_like"
-#
-#
-#@pytest.fixture
-#def accelerator():
-#    return "zigzag.inputs.examples.hardware.Ascend_like"
-#
-#
-#@pytest.mark.parametrize("workload", workloads)
-#def test_api(workload, accelerator, mapping):
-#    (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory(
-#        workload, accelerator, mapping
-#    )
-#    (expected_energy, expected_latency) = ens_lats[workload]
-#    print(energy, latency)
-#    assert energy == pytest.approx(expected_energy)
-#    assert latency == pytest.approx(expected_latency)
diff --git a/tests/main/test_ascend_like.py b/tests/main/test_with_unused_memory/test_ascend_like.py
similarity index 100%
rename from tests/main/test_ascend_like.py
rename to tests/main/test_with_unused_memory/test_ascend_like.py
diff --git a/tests/main/test_edge_tpu_like.py b/tests/main/test_with_unused_memory/test_edge_tpu_like.py
similarity index 100%
rename from tests/main/test_edge_tpu_like.py
rename to tests/main/test_with_unused_memory/test_edge_tpu_like.py
diff --git a/tests/main/test_meta_prototype_like.py b/tests/main/test_with_unused_memory/test_meta_prototype_like.py
similarity index 100%
rename from tests/main/test_meta_prototype_like.py
rename to tests/main/test_with_unused_memory/test_meta_prototype_like.py
diff --git a/tests/main/test_tesla_npu_like.py b/tests/main/test_with_unused_memory/test_tesla_npu_like.py
similarity index 100%
rename from tests/main/test_tesla_npu_like.py
rename to tests/main/test_with_unused_memory/test_tesla_npu_like.py
diff --git a/tests/main/test_tpu_like.py b/tests/main/test_with_unused_memory/test_tpu_like.py
similarity index 100%
rename from tests/main/test_tpu_like.py
rename to tests/main/test_with_unused_memory/test_tpu_like.py
diff --git a/tests/main/test_without_unused_memory/.nfs000000000ceb00d10000304b b/tests/main/test_without_unused_memory/.nfs000000000ceb00d10000304b
deleted file mode 100644
index 6d6f0b6540d5d109039abd523870bb3375ea1dc0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
[binary payload of deleted vim swap file omitted]

diff --git a/tests/main/test_without_unused_memory/test_ascend_like.py
b/tests/main/test_without_unused_memory/test_ascend_like.py index cedd830e..4eee129a 100644 --- a/tests/main/test_without_unused_memory/test_ascend_like.py +++ b/tests/main/test_without_unused_memory/test_ascend_like.py @@ -6,7 +6,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx", "zigzag/inputs/examples/workload/mobilenetv2.onnx", "zigzag/inputs/examples/workload/resnet18.onnx", - #"zigzag.inputs.examples.workload.resnet18", + "zigzag.inputs.examples.workload.resnet18", ) # Expected energy and latency for each workload defined above @@ -14,7 +14,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx": (5649555894.9, 8637780), "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6486685), "zigzag/inputs/examples/workload/resnet18.onnx": (1709089377.83, 3583047), - "zigzag.inputs.examples.workload.resnet18": (2408671233.7250004, 4804196), + "zigzag.inputs.examples.workload.resnet18": (2243493483.15, 4657130), } diff --git a/tests/main/test_without_unused_memory/test_edge_tpu_like.py b/tests/main/test_without_unused_memory/test_edge_tpu_like.py index 59bdaa34..4b06d5de 100644 --- a/tests/main/test_without_unused_memory/test_edge_tpu_like.py +++ b/tests/main/test_without_unused_memory/test_edge_tpu_like.py @@ -6,7 +6,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx", "zigzag/inputs/examples/workload/mobilenetv2.onnx", "zigzag/inputs/examples/workload/resnet18.onnx", - #"zigzag.inputs.examples.workload.resnet18", + "zigzag.inputs.examples.workload.resnet18", ) # Expected energy and latency for each workload defined above @@ -14,7 +14,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx": (5568602396.684999, 8134431), "zigzag/inputs/examples/workload/mobilenetv2.onnx": (751128562.4699999, 2427487), "zigzag/inputs/examples/workload/resnet18.onnx": (1784539639.4349997, 3176546), - "zigzag.inputs.examples.workload.resnet18": (2413350265.7900004, 4314851), + "zigzag.inputs.examples.workload.resnet18": (2115122870.395, 3884789), } diff --git a/tests/main/test_without_unused_memory/test_meta_prototype_like.py b/tests/main/test_without_unused_memory/test_meta_prototype_like.py index 49d7491b..e2594f42 100644 --- a/tests/main/test_without_unused_memory/test_meta_prototype_like.py +++ b/tests/main/test_without_unused_memory/test_meta_prototype_like.py @@ -6,7 +6,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx", "zigzag/inputs/examples/workload/mobilenetv2.onnx", "zigzag/inputs/examples/workload/resnet18.onnx", - #"zigzag.inputs.examples.workload.resnet18", + "zigzag.inputs.examples.workload.resnet18", ) # Expected energy and latency for each workload defined above @@ -14,7 +14,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx": (5679695605.4400015, 8299150), "zigzag/inputs/examples/workload/mobilenetv2.onnx": (901092009.6000001, 2610609), "zigzag/inputs/examples/workload/resnet18.onnx": (1730672410.3200004, 3262009), - "zigzag.inputs.examples.workload.resnet18": (2419893343.4549994, 4176163), + "zigzag.inputs.examples.workload.resnet18": (2265438430.2299995, 4017227), } diff --git a/tests/main/test_without_unused_memory/test_tesla_npu_like.py b/tests/main/test_without_unused_memory/test_tesla_npu_like.py index 37080e6e..25eb9648 100644 --- a/tests/main/test_without_unused_memory/test_tesla_npu_like.py +++ b/tests/main/test_without_unused_memory/test_tesla_npu_like.py @@ -6,7 +6,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx", "zigzag/inputs/examples/workload/mobilenetv2.onnx", "zigzag/inputs/examples/workload/resnet18.onnx", - #"zigzag.inputs.examples.workload.resnet18", + 
"zigzag.inputs.examples.workload.resnet18", ) # Expected energy and latency for each workload defined above @@ -14,7 +14,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx": (6040086796.366001, 8389669), "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060.6110002, 1965457), "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3257898), - "zigzag.inputs.examples.workload.resnet18": (2375316568.8910007, 4082454), + "zigzag.inputs.examples.workload.resnet18": (2220861655.6660004, 3934616), } diff --git a/tests/main/test_without_unused_memory/test_tpu_like.py b/tests/main/test_without_unused_memory/test_tpu_like.py index 3386b816..28df3fa1 100644 --- a/tests/main/test_without_unused_memory/test_tpu_like.py +++ b/tests/main/test_without_unused_memory/test_tpu_like.py @@ -6,7 +6,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx", "zigzag/inputs/examples/workload/mobilenetv2.onnx", "zigzag/inputs/examples/workload/resnet18.onnx", - #"zigzag.inputs.examples.workload.resnet18", + "zigzag.inputs.examples.workload.resnet18", ) # Expected energy and latency for each workload defined above @@ -14,7 +14,7 @@ "zigzag/inputs/examples/workload/alexnet.onnx": (5475639384.492001, 8979956), "zigzag/inputs/examples/workload/mobilenetv2.onnx": (952688145.0069999, 21873214), "zigzag/inputs/examples/workload/resnet18.onnx": (1659252422.016, 4000289), - "zigzag.inputs.examples.workload.resnet18": (2296491401.491, 4909027), + "zigzag.inputs.examples.workload.resnet18": (1982830786.5119998, 4509235), } diff --git a/zigzag/classes/stages/.nfs000000000ceb00de0000304a b/zigzag/classes/stages/.nfs000000000ceb00de0000304a deleted file mode 100644 index 7cee7c54ec6e1f1a31790cc34a9bc13dfcfe955f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2zmF6*6vqt|6x@+0k&@QzHnXS2dzUl_>8hkT@k2-?5Cxqq?~G^8+{}2eJ;&`u zftC`{P|{JQNtK?C0{#H#Q$Uq2>GJHEo!!~vl=_r6(pMgP{OtEW@9j1Brki(mZqO@i zlfd&iA%DNxkbjboYhOM6n%uW_>6icODO4=%1#5$37GBu)%_jsxeud75jRc3rDtmH>3I}+Js>D_PIpS0_7F02K6>qG{_UHFu1 zk=*uER?WH_@dXJWfk2>k>E$cyIA&XC6t@7XqH+Ky*d#&(JW5-$J|pBk=pE=ckOdjg1JHHQHPCs`$BTq~0KEad1SKE_S_iFx{#hsFchD2i zPoPcE8tAVzLSBRZ06ho&3_1&X+xqeh#=`rA4-!BENB{{S0VIF~kN^@mtpw=GGgC58 zXk5sZ^^>Qa#h6C9m`O9H^3WNnRUyv&__GG`iZe=|0kJ zYGt0XTv^9;%43n|Vwy{PZ*w;~;mg+5-7($R*ncQYX8XS6I?JSC-pME(+%h_r)1RgQ*GR}&L4sxqI@>!>X=X&V#|UYv+?NWJJ5)Ezq`Mv}KjNXdIUm8FuGHv) zHjV78ZDeO%42drZ?(d?is33ET+<(%r>40Y;j=4pc5YpPW&djH!bpEnKDl9Dz$-m8= BzaRhr diff --git a/zigzag/classes/stages/.nfs000000000ceb00e100003049 b/zigzag/classes/stages/.nfs000000000ceb00e100003049 deleted file mode 100644 index c41e8a561c8d804559f0ef78a09d8dcb5d689970..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24576 zcmeHPYm6mD6>gM=JXUrgL4RqnBXI9B)3eJXkcmTpg@p~Q7#0@eIJP~v>&|VaZ}+7i zGdl|lCPXwLJTxH&iGPd&CMuDLMgx%u2`0E|WQnW)1YcA$RA}qUEJaF%+KA0BI3m&kKH+5R==GHP69XgCVNT1v&p&Ck9z~JpQK(f3j92b z=e;>U45@;pR zN}!cMD}h!5tpr*Lv=V3~&`O|{Kr4Y(0{;aG_<`d*f4t*7a{`Ft{$HQZe{ibf+y-m| zjsaeHzvFx#cmjAB_y+J<;0|Cna60hZCdc^_a0hTca4hi0Qyk|zz-NH7ffrA9oQHsG zfGxnQCppfOz?Xr200TY@dFK`xc0`O<_|0UpY;C|pf;5y(nI6V#nPXTuU^S}gf0`OZn zJRSuK;0jLvEPY-UykOQogYXgibYHZ7-vt@l+S zG%beBZq67h6foYZVg)wWPS^FITVd{b9b&>7t8u7`)DkHlL!)#?HI#w}!DE%y5Xp*J z)NYj_=v1!;=T+ji>T8qfs)HCtS&#;{?4~0kDAn9=1WtBD_#{n7XY6Pm)ME4miP#za zAdnV~TG${F%v7u?WaemfX0&BsMepTB$|V8pK`81{R4L||-X~#k2m@$}3eh4U$qCgH z>sHpuk`#lO!X%nv{WKY3{Tk64HkagiGDN>n^E_9td~zY8`dElG>tdPWZ04Ms&Y4w9 zZ;g?0(c*}j!n{s;CC_Wr(W@qHiJn|!FJU>BAf9{DtSK`DlO)8Flk1LxlU3`i#fjS9 
z)?j7#QLwUVowZmY>&50{og*%Xng$(XFGqZb%l6QMy(}~a`)?t1O$vJDLC`Cj+;i0) zN6M{=TkMVN)Q0@99p+LIsV{0L70VrH?Z(0b^E^BtY-Hl5@vx6)yaKuOa7C3awaW+bm9!JnM>heJd5;v){ zgIFu5R`X$3@T`}HBVqhEv%$9$CG+z3L#)wLq6<9V?^@7sx3pgM35=4@VHfv-D zej31n;I6rNHSAqYagSJGuGZ%91}F-vS9NqJ?Ar}gHTDk8$7PkZ?9loDFwVL^!#UVI z{~yAc|88Ixa3OFKa0qAo2Z4RS7T_hE?;isOz~#Ue;0Vt6-v(|6E(0b37uXE^6X*RS zz_Y*?f&G9FYzO{@^ZpUw=fEN0QQ%HsFK_{{3HUqC_`d>v3OolK1a1S)1Q>7(-~fNY zS^rzWUBDH<8NeyP$pH297u47GX(iA~pp`%?fmQrQRUkhWBU~E(qT--e!%An zVH!Hb4V_|%w(Oqvzrd`fewH@<$~~06pi5y3<{>;s`5=MuLqzFGIS!sN9E(e)H>b+i zjmM0tTVAV>k7vJyR~wB9CTfG>#Ac%cB7Z^e44sN zFEv*e8Q3C^zElHQ69TMr!(^;Rt0am7x9XRJju0pQPzupSuJ|~YZJp5GzX{=PoWQtk zt;)-?qHl)P>Qd#h&=}j9auN^u4$y+-)hSEa8Vqsz{U}+=L|4nVs=r`X9sjHOAj&Ge zD(8<~rQxNXZlu^1!az_BgDP2z4MHNr-WG1jDspSbkxrRc^_6G|-*Om~k|>LT1Sx=9 zCX_`)6_IDuYsi@jR2oYXqo-lEK#l2cjDt;}kS&@WVR|6LwE$z5=Oy1S43Dwc8blD~K8#2mn2xEmIZj`a5 zO=K$RpnhJAA}*G`D!ExjGiFuLiFR@7n%#=)k48KWI#mNR@CESwaG#bvo^29oltJUV zNi2IH(6R4Snj}n_YA)8Q(-m$daRq|mQ_i-JLP*lv*6`6-<47I1iIWddb8HpGkXMtI z>e2Id3C+3+U7yTqRp|Ub3HxWB?xAu2f0eR@Vx0G1!x{faz&*gHfR6#E0YAn$|4!f* z;1j?U@H)=-&jMco3ZM)81Lyifz=Oc2fy;rv;T-=x-~k{2dcb+WUvX}yJN^{79=H@Z z33w7`_6LE_12+I41fIwF{UC4=o#TPW;J>&VD1dq3TwojU2JY$U{{A80THqSs4cyUx z9SDI70SEXY?&t3U_5jxcCjzhFe*Sr25x5dK4tN?eB02TdoxKv(dOoCD$iJj9)+m^| z^(JcDa1xt@>HFKIdB(N!`t)wq3M39Aa|6P#ysy!jquNKQJT{bRw!@)b>Ssl|Xisq{E*kXh zg-y)SB0OJSdHpjTSsb=tG8CmE}IiKzI1_(0&+nYcQ9o6 z%exe;(L64O;^w)dsFb!_b>|{3=!|>3F^2FNDXfrZ;?6)Bmy!l6X)wCNR?#`Qq(>Nu zpXX^fSL8gahM$zH-a7iY$9DgAglx>{hjB4`hXY|XR}vHCWFkpQeIsj{xWP6xA^rSh zO+yxyL9L6Hict2WZc~?c5S3kB?|RieQk7M&ON8}~y84#Ghzp>4Jj%E|Qz?}mdw$9^ z`PWz+95Y>iUpFsw<1(XyB4%esNj6&rLTF-S!J$|O;Uq4?F0xLyiwR0$P6iux4@fOR z3_%JJr`4-c!MZRk1`GwH#>d#z9HA{sIQaPsocRoT)mxat2aY=;1_L^o;M9|%4T@z! z*q617IJygx+W$ZZ8Oib}0?3H}m9@b($s8(ZKC75^m^5n+#0r|Qx)#_h*or{BtjxHf zJC4aQeOFvto7sPqWux(m-?y%`V{8{)16J(ge>&l0HK`;-Yk*jl?S07CVEaC!LU-GH#v=&5WaYN&2h}+6D(l^Sd z-Tx_y9RmqFbqzDZE+}F)ZTn&EK@oDXxdf*fLZj>xeAl51JeyY$!=a9xdUteuK-DNAMd3BE6aiR87Rz z8J*A?Q<_R}rsm5spu|Op$`HGPO@XMG^qX!3MbplpNoXG)O};V2(xNh0Q~7#pa;Rbb zj8S~cfbA9s-<#k&lrh91!eDS~ueS>;qL>(xaDsS^w9?o*SGn#RZJQij_!&6 zG&*j_!_Mrm*V(tU3fM|zv?;mLs;m|eBfrMe_SsqN;-FSrAxQaF=%5TFs-ER1`w2;yj&hA@Pq9Bxje9zY3%eo&+a4R6YCDMC^J zupo00hS)1G7vPw4#r^-U>3of|Hl6>^Qm5H(;{1OAxE}Zja6Iq|&i=mxz6;z3+zXIT z;8mReUjUv2J_q!Hao~91H}C^I0_4C>;Bw$p;4pjvKLH*FLf~wGeh=_6d;t#wi@=q@ zX5cV*6a1~`q}`TjCD2Nsl|U^x6LWkAC*3&2BorIi`5tlC3X} zA?r5LsoD1Df2*AAr60lgQII+=l}=`yUqw_wJ-r-Faq2N0;JQX5qpU&%QL6pX{!4a64yKE=YROkD`tdnOKkX8mtE)ITn~iB diff --git a/zigzag/classes/stages/RemoveUnusedMemoryStage.py b/zigzag/classes/stages/RemoveUnusedMemoryStage.py index c6d73541..58cc4762 100644 --- a/zigzag/classes/stages/RemoveUnusedMemoryStage.py +++ b/zigzag/classes/stages/RemoveUnusedMemoryStage.py @@ -51,8 +51,6 @@ def __init__( self.mem_update_weight = mem_update_weight def run(self) -> Generator: - print(self.mem_update_list) - print(self.mem_update_weight) modified_accelerator = self.generate_accelerator_with_removing_unused_memory() sub_stage = self.list_of_callables[0]( self.list_of_callables[1:], diff --git a/zigzag/classes/stages/SearchUnusedMemoryStage.py b/zigzag/classes/stages/SearchUnusedMemoryStage.py index 1be0f79d..53229eb5 100644 --- a/zigzag/classes/stages/SearchUnusedMemoryStage.py +++ b/zigzag/classes/stages/SearchUnusedMemoryStage.py @@ -46,6 +46,29 @@ ## if ("W" in mem.operand) and (current_mem_level < mem_update_weight): ## mem_update_weight = current_mem_level ##################################################### +# Special note for Adder layers: +# Currently the algorithm is tricky for Adder 
+# Currently, the algorithm is tricky for Adder layers. For a conv/pool layer, the required I and O sizes are put in
+# each_layer_IO_data_size and the weight data size is accumulated in weight_size_entire_workload.
+# For Adder layers, however: (1) there is no weight operand (or constant operand); (2) there are two input operands;
+# (3) the info regarding which of the two operands is represented as I1 or I2 is not saved in self.workload,
+# though it is defined in the input file.
+# So, the current solution is:
+# (1) for weight, the data amount is 0, which means weight_size_entire_workload does not consider Adder layers;
+# (2) for act, we add up the data sizes of the two (or more) inputs and treat the sum as the act data size
+# of the current layer, which is stored in each_layer_IO_data_size.
+# The consequence is that for Adder layers the required act data size is over-estimated, because it also
+# includes the data amount of the other operand, for which a separate mem may have been defined.
+# In other words, a mem level with enough size to hold both O and I1
+# (assuming I1 is the mem representation of one input)
+# may be judged by the code as too small, so that the output cannot be stored at this level.
+# Keep in mind: this is only a problem when you use a manually-defined workload that contains Adder layers;
+# there is no problem if your workload is an .onnx file, because Adder layers are skipped by default.
+# This cannot be fixed as long as we do not know which operand comes from which layer;
+# it can only be fixed once this info granularity is saved in the self.workload object,
+# which is a networkx graph.
 
 
 class SearchUnusedMemoryStage(Stage):
@@ -296,33 +321,53 @@ def update_top_mem_level(self):
                 avail_mem_size = (
                     mem.memory_instance.size
                 )  # available hardware mem size
-                if (
-                    len(layer.constant_operands) == 0
-                ):  # Adder layer: multiple act operands
-                    mem_serve_act = False
-                    for layer_act_operand in layer.input_operands:
-                        if (
-                            layer.memory_operand_links[layer_act_operand]
-                            in served_operands
-                        ):
-                            mem_serve_act = True
-                    # modify to match the keys used in each_layer_IO_data_size
-                    served_operands = [
-                        output_operand,
-                        layer.memory_operand_links[layer.input_operands[0]],
-                    ]
-                else:
-                    mem_serve_act = (
-                        True if (act_operand in served_operands) else False
+
+                try:
+                    # we need to grab the next layer, which is a non-Adder layer for sure:
+                    # if the next layer is an Adder layer, then branch_final_node=True for the current layer,
+                    # so, the simulation will not reach this "else" branch.
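+                    # Illustration (operand names below are hypothetical): if the next layer maps its
+                    # activation input I1 to mem operand "I1", and this mem level serves ("I1", "O"),
+                    # then this level carries the activation traffic between the two layers and is
+                    # the one whose size must be checked here.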
+                    next_layer = list(self.workload.successors(layer))[0]
+                    # next, we find out the layer representation for the act operand of the next layer
+                    const_layer_operand_of_next_layer = next_layer.constant_operands[0]
+                    act_layer_operand_of_next_layer = \
+                        [operand for operand in next_layer.input_operands if operand != const_layer_operand_of_next_layer][
+                            0]
+                    # then, we will fetch the mem representation for the act operand of the next layer
+                    act_mem_operand_of_next_layer = next_layer.memory_operand_links[act_layer_operand_of_next_layer]
+                    # check if the current mem level serves the act operand in the next layer
+                    mem_serve_act_in_next_layer = (
+                        True if (act_mem_operand_of_next_layer in served_operands) else False
                     )
+                except IndexError:  # there is no next layer, which means the current layer is the last layer
+                    # As for the last layer, we will instead check
+                    # if the mem serves the act operand of the current layer.
+                    mem_serve_act_in_next_layer = (
+                        True if (act_operand in served_operands) else False
+                    )
+
                 mem_serve_io_both = (
                     True
-                    if mem_serve_act and (output_operand in served_operands)
+                    if mem_serve_act_in_next_layer and (output_operand in served_operands)
                     else False
                 )  # ["I", "O"] both in mem.served_operands
                 mem_serve_weight = (
                     True if (const_operand in served_operands) else False
                 )  # mem.served_operands = ["W"]
+
+                # we need to change served_operands if the current layer is an Adder layer,
+                # to ease the calculation of the required input data size:
+                # an Adder layer has two inputs, but in each_layer_IO_data_size the data
+                # sizes of both inputs are put under one key, so we have to update
+                # served_operands to ensure that key is present.
+                if (
+                    len(layer.constant_operands) == 0 and mem_serve_io_both
+                ):  # the layer type is an Adder layer, which has multiple input operands
+                    served_operands = [
+                        output_operand,
+                        layer.memory_operand_links[layer.input_operands[0]],
+                    ]
+
                 if mem_serve_io_both or mem_serve_weight:
                     required_IO_data_size = sum(
                         [
@@ -358,7 +403,8 @@
         ## [NOTE] Until here, if there is still a -1 value in mem_update_list, it means the size of the top mem level for IO is not big enough.
         for layer_ele in self.mem_update_list.values():
             for operand_dict in layer_ele:
-                assert list(operand_dict.values())[0] >= 0
+                assert list(operand_dict.values())[0] >= 0, \
+                    "SearchUnusedMemoryStage finished abnormally: there are still layers whose top mem levels are not determined."
 
     def update_mem_level_for_loading_data(self):
         """
From 066dc4617da40076b30d2a9f4f4c391b6d81df98 Mon Sep 17 00:00:00 2001
From: JiacongSun
Date: Fri, 20 Oct 2023 22:01:50 +0200
Subject: [PATCH 5/7] reformat code in black style

---
 .../classes/stages/RemoveUnusedMemoryStage.py |  8 ++--
 .../classes/stages/SearchUnusedMemoryStage.py | 45 ++++++++++++-------
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/zigzag/classes/stages/RemoveUnusedMemoryStage.py b/zigzag/classes/stages/RemoveUnusedMemoryStage.py
index 58cc4762..f0d73eac 100644
--- a/zigzag/classes/stages/RemoveUnusedMemoryStage.py
+++ b/zigzag/classes/stages/RemoveUnusedMemoryStage.py
@@ -89,7 +89,9 @@ def generate_accelerator_with_removing_unused_memory(self):
         # special case when defining workload manually:
         # the constant operands list is empty for layers such as "Adder" layers
         # for the input operand, we will represent all inputs as one input, since only their data size is used for the required mem size calculation.
-        act_operand = self.layer.memory_operand_links[self.layer.input_operands[0]]  # act representation in memory
+        act_operand = self.layer.memory_operand_links[
+            self.layer.input_operands[0]
+        ]  # act representation in memory
         const_operand = self.layer.memory_operand_links[
             self.layer.input_operands[1]
         ]  # weight representation in memory
@@ -118,9 +120,7 @@ def generate_accelerator_with_removing_unused_memory(self):
         # Find target_act/const/output_mem_level
         for pos, ele in enumerate(self.mem_update_list[curr_id]):
             if list(ele.keys())[0] == act_operand:
-                target_act_mem_level = self.mem_update_list[curr_id][pos][
-                    act_operand
-                ]
+                target_act_mem_level = self.mem_update_list[curr_id][pos][act_operand]
             if list(ele.keys())[0] == output_operand:
                 target_output_mem_level = self.mem_update_list[curr_id][pos][
                     output_operand
diff --git a/zigzag/classes/stages/SearchUnusedMemoryStage.py b/zigzag/classes/stages/SearchUnusedMemoryStage.py
index 53229eb5..190a2575 100644
--- a/zigzag/classes/stages/SearchUnusedMemoryStage.py
+++ b/zigzag/classes/stages/SearchUnusedMemoryStage.py
@@ -328,26 +328,37 @@ def update_top_mem_level(self):
                     # so, the simulation will not reach this "else" branch.
                     next_layer = list(self.workload.successors(layer))[0]
                     # next, we find out the layer representation for the act operand of the next layer
-                    const_layer_operand_of_next_layer = next_layer.constant_operands[0]
-                    act_layer_operand_of_next_layer = \
-                        [operand for operand in next_layer.input_operands if operand != const_layer_operand_of_next_layer][
-                            0]
+                    const_layer_operand_of_next_layer = (
+                        next_layer.constant_operands[0]
+                    )
+                    act_layer_operand_of_next_layer = [
+                        operand
+                        for operand in next_layer.input_operands
+                        if operand != const_layer_operand_of_next_layer
+                    ][0]
                     # then, we will fetch the mem representation for the act operand of the next layer
-                    act_mem_operand_of_next_layer = next_layer.memory_operand_links[act_layer_operand_of_next_layer]
+                    act_mem_operand_of_next_layer = next_layer.memory_operand_links[
+                        act_layer_operand_of_next_layer
+                    ]
                     # check if the current mem level serves the act operand in the next layer
                     mem_serve_act_in_next_layer = (
-                        True if (act_mem_operand_of_next_layer in served_operands) else False
+                        True
+                        if (act_mem_operand_of_next_layer in served_operands)
+                        else False
                     )
-                except IndexError:  # there is no next layer, which means the current layer is the last layer
+                except (
+                    IndexError
+                ):  # there is no next layer, which means the current layer is the last layer
                     # As for the last layer, we will instead check
                     # if the mem serves the act operand of the current layer.
                     mem_serve_act_in_next_layer = (
-                        True if (act_operand in served_operands) else False
-                    )
+                        True if (act_operand in served_operands) else False
+                    )
                 mem_serve_io_both = (
                     True
-                    if mem_serve_act_in_next_layer and (output_operand in served_operands)
+                    if mem_serve_act_in_next_layer
+                    and (output_operand in served_operands)
                     else False
                 )  # ["I", "O"] both in mem.served_operands
                 mem_serve_weight = (
@@ -360,13 +371,12 @@
                 # we need to change served_operands if the current layer is an Adder layer,
                 # to ease the calculation of the required input data size:
                 # an Adder layer has two inputs, but in each_layer_IO_data_size the data
                 # sizes of both inputs are put under one key, so we have to update
                 # served_operands to ensure that key is present.
                 if (
-                    len(layer.constant_operands) == 0 and mem_serve_io_both
+                    len(layer.constant_operands) == 0 and mem_serve_io_both
                 ):  # the layer type is an Adder layer, which has multiple input operands
                     served_operands = [
-                        output_operand,
-                        layer.memory_operand_links[layer.input_operands[0]],
-                    ]
-
+                        output_operand,
+                        layer.memory_operand_links[layer.input_operands[0]],
+                    ]
 
                 if mem_serve_io_both or mem_serve_weight:
                     required_IO_data_size = sum(
@@ -403,8 +413,9 @@
         ## [NOTE] Until here, if there is still a -1 value in mem_update_list, it means the size of the top mem level for IO is not big enough.
         for layer_ele in self.mem_update_list.values():
             for operand_dict in layer_ele:
-                assert list(operand_dict.values())[0] >= 0, \
-                    "SearchUnusedMemoryStage finished abnormally: there are still layers whose top mem levels are not determined."
+                assert (
+                    list(operand_dict.values())[0] >= 0
+                ), "SearchUnusedMemoryStage finished abnormally: there are still layers whose top mem levels are not determined."
 
     def update_mem_level_for_loading_data(self):
         """
From 402499d6ffdb04e9474fd6d71fc2db49ce8396e6 Mon Sep 17 00:00:00 2001
From: JiacongSun
Date: Sat, 21 Oct 2023 09:55:32 +0200
Subject: [PATCH 6/7] rename pytest folder for the original pytest cases:
 test_with_unused_memory -> test_origin

---
 .../test_ascend_like.py                         |   0
 .../test_edge_tpu_like.py                       |   0
 .../test_meta_prototype_like.py                 |   0
 .../test_tesla_npu_like.py                      |   0
 .../test_tpu_like.py                            |   0
 .../SpatialMappingAutoGeneratorStage.py         | 274 ++++++++++++++++++
 6 files changed, 274 insertions(+)
 rename tests/main/{test_with_unused_memory => test_origin}/test_ascend_like.py (100%)
 rename tests/main/{test_with_unused_memory => test_origin}/test_edge_tpu_like.py (100%)
 rename tests/main/{test_with_unused_memory => test_origin}/test_meta_prototype_like.py (100%)
 rename tests/main/{test_with_unused_memory => test_origin}/test_tesla_npu_like.py (100%)
 rename tests/main/{test_with_unused_memory => test_origin}/test_tpu_like.py (100%)
 create mode 100644 zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py

diff --git a/tests/main/test_with_unused_memory/test_ascend_like.py b/tests/main/test_origin/test_ascend_like.py
similarity index 100%
rename from tests/main/test_with_unused_memory/test_ascend_like.py
rename to tests/main/test_origin/test_ascend_like.py
diff --git a/tests/main/test_with_unused_memory/test_edge_tpu_like.py b/tests/main/test_origin/test_edge_tpu_like.py
similarity index 100%
rename from tests/main/test_with_unused_memory/test_edge_tpu_like.py
rename to tests/main/test_origin/test_edge_tpu_like.py
diff --git a/tests/main/test_with_unused_memory/test_meta_prototype_like.py b/tests/main/test_origin/test_meta_prototype_like.py
similarity index 100%
rename from tests/main/test_with_unused_memory/test_meta_prototype_like.py
rename to tests/main/test_origin/test_meta_prototype_like.py
diff --git a/tests/main/test_with_unused_memory/test_tesla_npu_like.py b/tests/main/test_origin/test_tesla_npu_like.py
similarity index 100%
rename from tests/main/test_with_unused_memory/test_tesla_npu_like.py
rename to tests/main/test_origin/test_tesla_npu_like.py
diff --git a/tests/main/test_with_unused_memory/test_tpu_like.py b/tests/main/test_origin/test_tpu_like.py
similarity index 100%
rename from tests/main/test_with_unused_memory/test_tpu_like.py
rename to tests/main/test_origin/test_tpu_like.py
diff --git a/zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py b/zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py
new file mode 100644
index 00000000..f98912ef
--- /dev/null
+++ b/zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py
@@ -0,0 +1,274 @@
+import logging
+
+from zigzag.classes.hardware.architecture.accelerator import Accelerator
+from zigzag.classes.hardware.architecture.core import Core
+from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy
+from zigzag.classes.opt.spatial.autogenerator import UserSpatialMappingAutoGenerator
+from zigzag.classes.stages.Stage import Stage
+from zigzag.classes.stages.SpatialMappingMixConversionStage import (
+    SpatialMappingMixConversionStage,
+)
+import copy
+from zigzag.utils import pickle_deepcopy
+
+logger = logging.getLogger(__name__)
+
+
+## Pipeline stage that finds spatial mappings given:
+# - an accelerator
+# - a core allocation
+# - the interconnection pattern on the allocated core
+# - a layer
+#
+# The spatial mappings are found using the interconnection pattern present on the core.
+#
+# The served dimensions of the inner-most memory levels are used,
+# as this is how the memories connect to the operational array.
+class SpatialMappingAutoGeneratorStage(Stage):
+    ## The class constructor
+    # Note: list_of_callables does NOT need to include SpatialMappingConversionStage. Although it is used,
+    # it is instantiated automatically.
+    def __init__(
+        self,
+        list_of_callables,
+        *,
+        accelerator,
+        layer,
+        enable_mix_sm,
+        enable_speedup,
+        enable_ox_unroll,
+        **kwargs,
+    ):
+        super().__init__(list_of_callables, **kwargs)
+        self.accelerator = accelerator
+        self.check_layer(layer)
+        self.layer = layer
+        self.enable_mix_sm = enable_mix_sm  # True: enable generating mix sm
+        self.enable_speedup = enable_speedup  # True: only keep the 3 sm with the highest hardware utilization, to speed up simulation
+        self.enable_ox_unroll = enable_ox_unroll  # True: enable OX/OY unrolling when automatically generating sm
+
+    @staticmethod
+    # Check that the layer includes:
+    # - the core which it is allocated to
+    #
+    # If not, or if the layer is not set at all, a ValueError is raised.
+    #
+    # @return: True if the layer is set correctly
+    def check_layer(layer):
+        if layer is None:
+            raise ValueError()
+        if layer.core_allocation is None:
+            logger.critical(f"Layer {layer} has no core allocation.")
+            raise ValueError()
+        return True
+
+    ## Run this stage by generating user-formatted spatial mappings which are converted
+    # to the memory-level based spatial mapping representation.
+    def run(self, enable_ox_unroll=True):
+        # @param enable_ox_unroll: True - will adjust the input mem size if there is an OX / OY mapping in the spatial mapping.
+        # Note: this param should be True if @param enable_ox_unroll in autogenerator.py is True
+        user_provided_spatial_mappings = self.layer.user_spatial_mapping
+        user_provided_spatial_mapping_hint = self.layer.user_spatial_mapping_hint
+        core_id = self.layer.core_allocation
+        oa_dims = self.accelerator.get_core(
+            core_id=core_id
+        ).operational_array.dimensions
+
+        if isinstance(
+            user_provided_spatial_mappings, dict
+        ):  # There is a single USM provided
+            user_spatial_mappings = [user_provided_spatial_mappings]
+        elif isinstance(
+            user_provided_spatial_mappings, list
+        ):  # There are multiple USMs provided
+            user_spatial_mappings = user_provided_spatial_mappings
+        else:  # There is no USM provided
+            # Initialize user_provided_spatial_mapping_hint
+            if user_provided_spatial_mapping_hint is None:
+                logger.warning(
+                    f"No user-provided spatial mappings or hint found. Auto-generating..."
+                )
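+                # Sketch of the auto-generated hint (dimension names below are hypothetical): every
+                # operational array dimension is allowed to unroll any layer dimension, e.g.
+                # {"D1": ["K", "C", "OX", "OY"], "D2": ["K", "C", "OX", "OY"]}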
+                user_provided_spatial_mapping_hint = {}
+                for oa_dim in oa_dims:
+                    user_provided_spatial_mapping_hint[oa_dim.name] = [
+                        layer_dim for layer_dim in self.layer.loop_dim_list
+                    ]
+                self.layer.user_spatial_mapping_hint = (
+                    user_provided_spatial_mapping_hint
+                )
+            else:
+                # Check if every oa_dim is in user_provided_spatial_mapping_hint; complete the hint for any oa_dim that is missing.
+                for oa_dim in oa_dims:
+                    if oa_dim.name not in user_provided_spatial_mapping_hint.keys():
+                        user_provided_spatial_mapping_hint[oa_dim.name] = [
+                            layer_dim for layer_dim in self.layer.loop_dim_list
+                        ]
+                logger.debug(
+                    f"No user-provided spatial mappings found, but hint found. Auto-generating..."
+                )
+            # Initialize the UserSpatialMappingGenerator which will automatically generate SMs
+            user_spatial_mapping_generator = UserSpatialMappingAutoGenerator(
+                self.layer,
+                self.accelerator,
+                self.enable_mix_sm,
+                self.enable_speedup,
+                self.enable_ox_unroll,
+            )
+            # Get all the USMs by running the generator
+            user_spatial_mappings = list(
+                (usm for usm in user_spatial_mapping_generator.run())
+            )
+
+        nb_user_spatial_mappings = len(user_spatial_mappings)
+
+        for i, user_spatial_mapping in enumerate(user_spatial_mappings):
+            logger.info(f"Launching spatial mapping {i+1}/{nb_user_spatial_mappings}:")
+            # Set the user_spatial_mapping in the layer, as this is required by SpatialMappingConversionStage
+            self.layer.user_spatial_mapping = user_spatial_mapping
+            # Note: manual instantiation of the spatial mapping conversion stage here. We let that class deal with
+            # everything else, including instantiation of the actual substages
+
+            # TODO: [jiacong] [ADD] modify the size of the lower input mem to support OX, OY spatial unrolling
+            # enable_ox_unroll: True - will adjust the input mem size if there is an OX / OY mapping in the spatial mapping.
+            if enable_ox_unroll:
+                # get the new accelerator and a flag telling whether the input mem size will be scaled
+                # @param update_input_mem_size: True - input mem scaling is required, so the accelerator will be modified.
+                (
+                    update_input_mem_size,
+                    new_accelerator,
+                ) = self.modify_innermost_input_mem_size(core_id, user_spatial_mapping)
+            if enable_ox_unroll and update_input_mem_size:
+                original_accelerator = self.accelerator
+                spatial_mapping_conversion_stage = SpatialMappingMixConversionStage(
+                    self.list_of_callables,
+                    accelerator=new_accelerator,
+                    layer=copy.copy(self.layer),
+                    **self.kwargs,
+                )
+            else:
+                # TODO: [jiacong] [FINISH]
+
+                spatial_mapping_conversion_stage = SpatialMappingMixConversionStage(
+                    self.list_of_callables,
+                    accelerator=self.accelerator,
+                    layer=copy.copy(self.layer),
+                    **self.kwargs,
+                )
+
+            for cme, extra_info in spatial_mapping_conversion_stage.run():
+                # TODO: [jiacong] [ADD] recover the accelerator if its mem size was adjusted before
+                if enable_ox_unroll and update_input_mem_size:
+                    cme.accelerator = original_accelerator
+                # TODO: [jiacong] [FINISH]
+                yield cme, (user_spatial_mapping, extra_info)
+
+    ## Modify the memory size of the innermost input mem to support OX, OY unrolling
+    def modify_innermost_input_mem_size(self, core_id, user_spatial_mapping):
+        # To support OX, OY unrolling, we will scale the lowest input mem size by OXu*OYu
+        # to avoid the MemoryTooSmallException in the loma stage.
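+        # Worked example (numbers are hypothetical): if the spatial mapping unrolls OX by 4 and OY by 2
+        # on the dimension served by the innermost activation memory, mem_scaling_factor below becomes
+        # 4 * 2 = 8, so the innermost input memory is modelled at 8x its original size for this layer.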
+ core = self.accelerator.get_core(core_id=core_id) + operational_array = core.operational_array + oa_dims = operational_array.dimensions + memory_hierarchy = copy.deepcopy(core.memory_hierarchy) + innermost_levels = memory_hierarchy.get_inner_memories() + # get the link from layer op to mem op + layer_op_to_mem_op: dict = self.layer.memory_operand_links + # get weight operand name + const_operand = self.layer.constant_operands[0] # weight representation + # get activation operand name + act_operand = [ + operand for operand in self.layer.input_operands if operand != const_operand + ][0] + # get name of OX, OY (weight ir layer dims) + weight_ir_layer_dims: list = self.layer.operand_loop_dim[const_operand]["ir"] + # get the oa_dim name served by input innermost memory level + for memory_level in innermost_levels: + mem_ops = memory_level.operands + if layer_op_to_mem_op[act_operand] in mem_ops: + act_innermost_mem_level = memory_level + act_served_oa_dim: set = memory_level.served_dimensions + act_served_oa_dim_name = list(act_served_oa_dim)[0].name + # get the mem scaling factor if OX, OY exist + mem_scaling_factor = 1 + try: + if ( + act_served_oa_dim_name not in user_spatial_mapping.keys() + ): # there is no sm loop + pass + else: # there is sm loop on act served oa dim + act_served_oa_mapping = user_spatial_mapping[act_served_oa_dim_name] + if isinstance( + act_served_oa_mapping[0], str + ): # a single layer dim mapping + layer_dim = act_served_oa_mapping[0] + if layer_dim in weight_ir_layer_dims: + layer_size = act_served_oa_mapping[1] + mem_scaling_factor *= layer_size + else: # a mix sm mapping, e.g. (("K", 2), ("OX", 5)) + for element in act_served_oa_mapping: + layer_dim = element[0] + if layer_dim in weight_ir_layer_dims: + layer_size = element[1] + mem_scaling_factor *= layer_size + except ( + UnboundLocalError + ): # except when act_layer_dim is not served in the innermost mems + pass + # scale the mem size + if mem_scaling_factor == 1: + # No need to change the input mem size + update_input_mem_size = False + return update_input_mem_size, self.accelerator + else: + update_input_mem_size = True + # Initialize the new memory hierarchy + mh_name = memory_hierarchy.name + new_mh_name = mh_name + "-supporting-diagonal-map" + new_memory_hierarchy = MemoryHierarchy(operational_array, new_mh_name) + # Add memories to the new memory hierarchy with the correct attributes + for curr_mem_level, memory_level in enumerate( + memory_hierarchy.mem_level_list + ): + memory_instance = memory_level.memory_instance + if memory_level == act_innermost_mem_level: + memory_instance.size *= mem_scaling_factor # scale here. For others, keep them unchanged. 
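+            # Note: memory_hierarchy is a deepcopy of the core's hierarchy, so scaling this instance
+            # does not mutate the original accelerator; run() puts the original accelerator back on
+            # the returned CME afterwards.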
+            operands = tuple(memory_level.operands)
+            port_alloc = memory_level.port_alloc_raw
+            served_dimensions_vec = memory_level.served_dimensions_vec
+            assert len(served_dimensions_vec) >= 1
+            served_dimensions = served_dimensions_vec[0]
+
+            new_memory_instance = pickle_deepcopy(memory_instance)
+            new_operands = pickle_deepcopy(operands)
+            new_port_alloc = pickle_deepcopy(port_alloc)
+            new_served_dimensions = pickle_deepcopy(served_dimensions)
+            new_memory_hierarchy.add_memory(
+                memory_instance=new_memory_instance,
+                operands=new_operands,
+                port_alloc=new_port_alloc,
+                served_dimensions=new_served_dimensions,
+            )
+        # Create the new core
+        id = core.id
+        dataflows = core.dataflows
+        new_id = id
+        new_dataflows = pickle_deepcopy(dataflows)
+
+        new_core = Core(
+            id=new_id,
+            operational_array=operational_array,
+            memory_hierarchy=new_memory_hierarchy,
+            dataflows=new_dataflows,
+        )
+
+        # Create the new accelerator
+        name = self.accelerator.name
+        new_name = name + "-scaled"
+        new_cores = {new_core}
+        new_accelerator = Accelerator(
+            name=new_name,
+            core_set=new_cores,
+        )
+        return update_input_mem_size, new_accelerator

From aeb74ab5e4996d076c9535aa3d2a8c91dc0c4565 Mon Sep 17 00:00:00 2001
From: JiacongSun
Date: Sat, 21 Oct 2023 09:58:48 +0200
Subject: [PATCH 7/7] remove unused file

---
 .../SpatialMappingAutoGeneratorStage.py       | 274 ------------------
 1 file changed, 274 deletions(-)
 delete mode 100644 zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py

diff --git a/zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py b/zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py
deleted file mode 100644
index f98912ef..00000000
--- a/zigzag/classes/stages/SpatialMappingAutoGeneratorStage.py
+++ /dev/null
@@ -1,274 +0,0 @@
-import logging
-
-from zigzag.classes.hardware.architecture.accelerator import Accelerator
-from zigzag.classes.hardware.architecture.core import Core
-from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy
-from zigzag.classes.opt.spatial.autogenerator import UserSpatialMappingAutoGenerator
-from zigzag.classes.stages.Stage import Stage
-from zigzag.classes.stages.SpatialMappingMixConversionStage import (
-    SpatialMappingMixConversionStage,
-)
-import copy
-from zigzag.utils import pickle_deepcopy
-
-logger = logging.getLogger(__name__)
-
-
-## Pipeline stage that finds spatial mappings given:
-# - an accelerator
-# - a core allocation
-# - the interconnection pattern on the allocated core
-# - a layer
-#
-# The spatial mappings are found using the interconnection pattern present on the core.
-#
-# The served dimensions of the inner-most memory levels are used,
-# as this is how the memories connect to the operational array.
-class SpatialMappingAutoGeneratorStage(Stage):
-    ## The class constructor
-    # Note: list_of_callables does NOT need to include SpatialMappingConversionStage. Although it is used,
-    # it is instantiated automatically.
-    def __init__(
-        self,
-        list_of_callables,
-        *,
-        accelerator,
-        layer,
-        enable_mix_sm,
-        enable_speedup,
-        enable_ox_unroll,
-        **kwargs,
-    ):
-        super().__init__(list_of_callables, **kwargs)
-        self.accelerator = accelerator
-        self.check_layer(layer)
-        self.layer = layer
-        self.enable_mix_sm = enable_mix_sm  # True: enable generating mix sm
-        self.enable_speedup = enable_speedup  # True: only keep the 3 sm with the highest hardware utilization, to speed up simulation
-        self.enable_ox_unroll = enable_ox_unroll  # True: enable OX/OY unrolling when automatically generating sm
-
-    @staticmethod
-    # Check that the layer includes:
-    # - the core which it is allocated to
-    #
-    # If not, or if the layer is not set at all, a ValueError is raised.
-    #
-    # @return: True if the layer is set correctly
-    def check_layer(layer):
-        if layer is None:
-            raise ValueError()
-        if layer.core_allocation is None:
-            logger.critical(f"Layer {layer} has no core allocation.")
-            raise ValueError()
-        return True
-
-    ## Run this stage by generating user-formatted spatial mappings which are converted
-    # to the memory-level based spatial mapping representation.
-    def run(self, enable_ox_unroll=True):
-        # @param enable_ox_unroll: True - will adjust the input mem size if there is an OX / OY mapping in the spatial mapping.
-        # Note: this param should be True if @param enable_ox_unroll in autogenerator.py is True
-        user_provided_spatial_mappings = self.layer.user_spatial_mapping
-        user_provided_spatial_mapping_hint = self.layer.user_spatial_mapping_hint
-        core_id = self.layer.core_allocation
-        oa_dims = self.accelerator.get_core(
-            core_id=core_id
-        ).operational_array.dimensions
-
-        if isinstance(
-            user_provided_spatial_mappings, dict
-        ):  # There is a single USM provided
-            user_spatial_mappings = [user_provided_spatial_mappings]
-        elif isinstance(
-            user_provided_spatial_mappings, list
-        ):  # There are multiple USMs provided
-            user_spatial_mappings = user_provided_spatial_mappings
-        else:  # There is no USM provided
-            # Initialize user_provided_spatial_mapping_hint
-            if user_provided_spatial_mapping_hint is None:
-                logger.warning(
-                    f"No user-provided spatial mappings or hint found. Auto-generating..."
-                )
-                user_provided_spatial_mapping_hint = {}
-                for oa_dim in oa_dims:
-                    user_provided_spatial_mapping_hint[oa_dim.name] = [
-                        layer_dim for layer_dim in self.layer.loop_dim_list
-                    ]
-                self.layer.user_spatial_mapping_hint = (
-                    user_provided_spatial_mapping_hint
-                )
-            else:
-                # Check if every oa_dim is in user_provided_spatial_mapping_hint; complete the hint for any oa_dim that is missing.
-                for oa_dim in oa_dims:
-                    if oa_dim.name not in user_provided_spatial_mapping_hint.keys():
-                        user_provided_spatial_mapping_hint[oa_dim.name] = [
-                            layer_dim for layer_dim in self.layer.loop_dim_list
-                        ]
-                logger.debug(
-                    f"No user-provided spatial mappings found, but hint found. Auto-generating..."
-                )
-            # Initialize the UserSpatialMappingGenerator which will automatically generate SMs
-            user_spatial_mapping_generator = UserSpatialMappingAutoGenerator(
-                self.layer,
-                self.accelerator,
-                self.enable_mix_sm,
-                self.enable_speedup,
-                self.enable_ox_unroll,
-            )
-            # Get all the USMs by running the generator
-            user_spatial_mappings = list(
-                (usm for usm in user_spatial_mapping_generator.run())
-            )
-
-        nb_user_spatial_mappings = len(user_spatial_mappings)
-
-        for i, user_spatial_mapping in enumerate(user_spatial_mappings):
-            logger.info(f"Launching spatial mapping {i+1}/{nb_user_spatial_mappings}:")
-            # Set the user_spatial_mapping in the layer, as this is required by SpatialMappingConversionStage
-            self.layer.user_spatial_mapping = user_spatial_mapping
-            # Note: manual instantiation of the spatial mapping conversion stage here. We let that class deal with
-            # everything else, including instantiation of the actual substages
-
-            # TODO: [jiacong] [ADD] modify the size of the lower input mem to support OX, OY spatial unrolling
-            # enable_ox_unroll: True - will adjust the input mem size if there is an OX / OY mapping in the spatial mapping.
-            if enable_ox_unroll:
-                # get the new accelerator and a flag telling whether the input mem size will be scaled
-                # @param update_input_mem_size: True - input mem scaling is required, so the accelerator will be modified.
-                (
-                    update_input_mem_size,
-                    new_accelerator,
-                ) = self.modify_innermost_input_mem_size(core_id, user_spatial_mapping)
-            if enable_ox_unroll and update_input_mem_size:
-                original_accelerator = self.accelerator
-                spatial_mapping_conversion_stage = SpatialMappingMixConversionStage(
-                    self.list_of_callables,
-                    accelerator=new_accelerator,
-                    layer=copy.copy(self.layer),
-                    **self.kwargs,
-                )
-            else:
-                # TODO: [jiacong] [FINISH]
-
-                spatial_mapping_conversion_stage = SpatialMappingMixConversionStage(
-                    self.list_of_callables,
-                    accelerator=self.accelerator,
-                    layer=copy.copy(self.layer),
-                    **self.kwargs,
-                )
-
-            for cme, extra_info in spatial_mapping_conversion_stage.run():
-                # TODO: [jiacong] [ADD] recover the accelerator if its mem size was adjusted before
-                if enable_ox_unroll and update_input_mem_size:
-                    cme.accelerator = original_accelerator
-                # TODO: [jiacong] [FINISH]
-                yield cme, (user_spatial_mapping, extra_info)
-
-    ## Modify the memory size of the innermost input mem to support OX, OY unrolling
-    def modify_innermost_input_mem_size(self, core_id, user_spatial_mapping):
-        # To support OX, OY unrolling, we will scale the lowest input mem size by OXu*OYu
-        # to avoid the MemoryTooSmallException in the loma stage.
- core = self.accelerator.get_core(core_id=core_id) - operational_array = core.operational_array - oa_dims = operational_array.dimensions - memory_hierarchy = copy.deepcopy(core.memory_hierarchy) - innermost_levels = memory_hierarchy.get_inner_memories() - # get the link from layer op to mem op - layer_op_to_mem_op: dict = self.layer.memory_operand_links - # get weight operand name - const_operand = self.layer.constant_operands[0] # weight representation - # get activation operand name - act_operand = [ - operand for operand in self.layer.input_operands if operand != const_operand - ][0] - # get name of OX, OY (weight ir layer dims) - weight_ir_layer_dims: list = self.layer.operand_loop_dim[const_operand]["ir"] - # get the oa_dim name served by input innermost memory level - for memory_level in innermost_levels: - mem_ops = memory_level.operands - if layer_op_to_mem_op[act_operand] in mem_ops: - act_innermost_mem_level = memory_level - act_served_oa_dim: set = memory_level.served_dimensions - act_served_oa_dim_name = list(act_served_oa_dim)[0].name - # get the mem scaling factor if OX, OY exist - mem_scaling_factor = 1 - try: - if ( - act_served_oa_dim_name not in user_spatial_mapping.keys() - ): # there is no sm loop - pass - else: # there is sm loop on act served oa dim - act_served_oa_mapping = user_spatial_mapping[act_served_oa_dim_name] - if isinstance( - act_served_oa_mapping[0], str - ): # a single layer dim mapping - layer_dim = act_served_oa_mapping[0] - if layer_dim in weight_ir_layer_dims: - layer_size = act_served_oa_mapping[1] - mem_scaling_factor *= layer_size - else: # a mix sm mapping, e.g. (("K", 2), ("OX", 5)) - for element in act_served_oa_mapping: - layer_dim = element[0] - if layer_dim in weight_ir_layer_dims: - layer_size = element[1] - mem_scaling_factor *= layer_size - except ( - UnboundLocalError - ): # except when act_layer_dim is not served in the innermost mems - pass - # scale the mem size - if mem_scaling_factor == 1: - # No need to change the input mem size - update_input_mem_size = False - return update_input_mem_size, self.accelerator - else: - update_input_mem_size = True - # Initialize the new memory hierarchy - mh_name = memory_hierarchy.name - new_mh_name = mh_name + "-supporting-diagonal-map" - new_memory_hierarchy = MemoryHierarchy(operational_array, new_mh_name) - # Add memories to the new memory hierarchy with the correct attributes - for curr_mem_level, memory_level in enumerate( - memory_hierarchy.mem_level_list - ): - memory_instance = memory_level.memory_instance - if memory_level == act_innermost_mem_level: - memory_instance.size *= mem_scaling_factor # scale here. For others, keep them unchanged. 
- operands = tuple(memory_level.operands) - port_alloc = memory_level.port_alloc_raw - served_dimensions_vec = memory_level.served_dimensions_vec - assert len(served_dimensions_vec) >= 1 - served_dimensions = served_dimensions_vec[0] - - new_memory_instance = pickle_deepcopy(memory_instance) - new_operands = pickle_deepcopy(operands) - new_port_alloc = pickle_deepcopy(port_alloc) - new_served_dimensions = pickle_deepcopy(served_dimensions) - new_memory_hierarchy.add_memory( - memory_instance=new_memory_instance, - operands=new_operands, - port_alloc=new_port_alloc, - served_dimensions=new_served_dimensions, - ) - # Create the new core - id = core.id - dataflows = core.dataflows - new_id = id - new_dataflows = pickle_deepcopy(dataflows) - - new_core = Core( - id=new_id, - operational_array=operational_array, - memory_hierarchy=new_memory_hierarchy, - dataflows=new_dataflows, - ) - - # Create the new accelerator - name = self.accelerator.name - new_name = name + "-scaled" - new_cores = {new_core} - new_accelerator = Accelerator( - name=new_name, - core_set=new_cores, - ) - return update_input_mem_size, new_accelerator