diff --git a/main.py b/main.py index 14f13214..8f6dee76 100644 --- a/main.py +++ b/main.py @@ -49,4 +49,4 @@ ) # Launch the MainStage -mainstage.run() \ No newline at end of file +mainstage.run() diff --git a/main_onnx.py b/main_onnx.py index 526abd29..de286087 100644 --- a/main_onnx.py +++ b/main_onnx.py @@ -45,4 +45,4 @@ ) # Launch the MainStage -mainstage.run() \ No newline at end of file +mainstage.run() diff --git a/tests/main/test_ascend_like.py b/tests/main/test_origin/test_ascend_like.py similarity index 100% rename from tests/main/test_ascend_like.py rename to tests/main/test_origin/test_ascend_like.py diff --git a/tests/main/test_edge_tpu_like.py b/tests/main/test_origin/test_edge_tpu_like.py similarity index 100% rename from tests/main/test_edge_tpu_like.py rename to tests/main/test_origin/test_edge_tpu_like.py diff --git a/tests/main/test_meta_prototype_like.py b/tests/main/test_origin/test_meta_prototype_like.py similarity index 100% rename from tests/main/test_meta_prototype_like.py rename to tests/main/test_origin/test_meta_prototype_like.py diff --git a/tests/main/test_tesla_npu_like.py b/tests/main/test_origin/test_tesla_npu_like.py similarity index 100% rename from tests/main/test_tesla_npu_like.py rename to tests/main/test_origin/test_tesla_npu_like.py diff --git a/tests/main/test_tpu_like.py b/tests/main/test_origin/test_tpu_like.py similarity index 100% rename from tests/main/test_tpu_like.py rename to tests/main/test_origin/test_tpu_like.py diff --git a/tests/main/test_without_unused_memory/test_ascend_like.py b/tests/main/test_without_unused_memory/test_ascend_like.py new file mode 100644 index 00000000..4eee129a --- /dev/null +++ b/tests/main/test_without_unused_memory/test_ascend_like.py @@ -0,0 +1,38 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_without_unused_memory + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + "zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy and latency for each workload defined above +ens_lats = { + "zigzag/inputs/examples/workload/alexnet.onnx": (5649555894.9, 8637780), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6486685), + "zigzag/inputs/examples/workload/resnet18.onnx": (1709089377.83, 3583047), + "zigzag.inputs.examples.workload.resnet18": (2243493483.15, 4657130), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.ascend_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Ascend_like" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + (expected_energy, expected_latency) = ens_lats[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) diff --git a/tests/main/test_without_unused_memory/test_edge_tpu_like.py b/tests/main/test_without_unused_memory/test_edge_tpu_like.py new file mode 100644 index 00000000..4b06d5de --- /dev/null +++ b/tests/main/test_without_unused_memory/test_edge_tpu_like.py @@ -0,0 +1,38 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_without_unused_memory + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + 
"zigzag/inputs/examples/workload/resnet18.onnx", + "zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy and latency for each workload defined above +ens_lats = { + "zigzag/inputs/examples/workload/alexnet.onnx": (5568602396.684999, 8134431), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (751128562.4699999, 2427487), + "zigzag/inputs/examples/workload/resnet18.onnx": (1784539639.4349997, 3176546), + "zigzag.inputs.examples.workload.resnet18": (2115122870.395, 3884789), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.edge_tpu_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Edge_TPU_like" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + (expected_energy, expected_latency) = ens_lats[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) diff --git a/tests/main/test_without_unused_memory/test_meta_prototype_like.py b/tests/main/test_without_unused_memory/test_meta_prototype_like.py new file mode 100644 index 00000000..e2594f42 --- /dev/null +++ b/tests/main/test_without_unused_memory/test_meta_prototype_like.py @@ -0,0 +1,38 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_without_unused_memory + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + "zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy and latency for each workload defined above +ens_lats = { + "zigzag/inputs/examples/workload/alexnet.onnx": (5679695605.4400015, 8299150), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (901092009.6000001, 2610609), + "zigzag/inputs/examples/workload/resnet18.onnx": (1730672410.3200004, 3262009), + "zigzag.inputs.examples.workload.resnet18": (2265438430.2299995, 4017227), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.meta_prototype_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Meta_prototype" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + (expected_energy, expected_latency) = ens_lats[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) diff --git a/tests/main/test_without_unused_memory/test_tesla_npu_like.py b/tests/main/test_without_unused_memory/test_tesla_npu_like.py new file mode 100644 index 00000000..25eb9648 --- /dev/null +++ b/tests/main/test_without_unused_memory/test_tesla_npu_like.py @@ -0,0 +1,38 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_without_unused_memory + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + "zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy and latency for each workload defined above +ens_lats = { + "zigzag/inputs/examples/workload/alexnet.onnx": (6040086796.366001, 8389669), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060.6110002, 1965457), + 
"zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3257898), + "zigzag.inputs.examples.workload.resnet18": (2220861655.6660004, 3934616), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.tesla_npu_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Tesla_NPU_like" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + (expected_energy, expected_latency) = ens_lats[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) diff --git a/tests/main/test_without_unused_memory/test_tpu_like.py b/tests/main/test_without_unused_memory/test_tpu_like.py new file mode 100644 index 00000000..28df3fa1 --- /dev/null +++ b/tests/main/test_without_unused_memory/test_tpu_like.py @@ -0,0 +1,38 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_without_unused_memory + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + "zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy and latency for each workload defined above +ens_lats = { + "zigzag/inputs/examples/workload/alexnet.onnx": (5475639384.492001, 8979956), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (952688145.0069999, 21873214), + "zigzag/inputs/examples/workload/resnet18.onnx": (1659252422.016, 4000289), + "zigzag.inputs.examples.workload.resnet18": (1982830786.5119998, 4509235), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.tpu_like" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.TPU_like" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, cmes) = get_hardware_performance_zigzag_without_unused_memory( + workload, accelerator, mapping + ) + (expected_energy, expected_latency) = ens_lats[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) diff --git a/zigzag/api.py b/zigzag/api.py index c1b8cb25..4e0d8f37 100644 --- a/zigzag/api.py +++ b/zigzag/api.py @@ -155,6 +155,82 @@ def get_hardware_performance_zigzag_pe_array_scaling( return cmes[0][0].energy_total, cmes[0][0].latency_total2, cmes +def get_hardware_performance_zigzag_without_unused_memory( + workload, + accelerator, + mapping, + opt="latency", + dump_filename_pattern="outputs/{datetime}.json", + pickle_filename="outputs/list_of_cmes.pickle", +): + # Initialize the logger + import logging as _logging + + _logging_level = _logging.INFO + _logging_format = ( + "%(asctime)s - %(funcName)s +%(lineno)s - %(levelname)s - %(message)s" + ) + _logging.basicConfig(level=_logging_level, format=_logging_format) + + # Sanity check on the optimization criterion + if opt == "energy": + opt_stage = MinimalEnergyStage + elif opt == "latency": + opt_stage = MinimalLatencyStage + elif opt == "EDP": + opt_stage = MinimalEDPStage + else: + raise NotImplementedError( + "Optimization criterion 'opt' should be either 'energy' or 'latency' or 'EDP'." 
+ ) + + # Check workload format and based on it select the correct workload parser stage + try: + if workload.split(".")[-1] == "onnx": + workload_parser_stage = ONNXModelParserStage + else: + workload_parser_stage = WorkloadParserStage + except: + workload_parser_stage = WorkloadParserStage + + mainstage = MainStage( + [ # Initialize the MainStage as entry point + workload_parser_stage, # Parse the ONNX Model into the workload + AcceleratorParserStage, # Parse the accelerator module/passthrough given accelerator + SimpleSaveStage, # Save the summed CME energy and latency to a json + PickleSaveStage, # Save all received CMEs in a list to a pickle file + SumStage, # Sum up the received best CME across all layers of the workload + SearchUnusedMemoryStage, # Search for unused memory instance + WorkloadStage, # Iterate through the different layers in the workload + RemoveUnusedMemoryStage, # Remove unused memory instance + CompleteSaveStage, # Save each processed layer to a json + opt_stage, # Reduce all CMEs, returning minimal energy/latency one + SpatialMappingGeneratorStage, # Generate multiple spatial mappings (SM) + opt_stage, # Reduce all CMEs, returning minimal energy/latency one + LomaStage, # Generate multiple temporal mappings (TM) + # TemporalOrderingConversionStage, # Based on the fixed temporal mapping order, generate one temporal mapping (TM) + CostModelStage, # Evaluate generated SM and TM through cost model + ], + accelerator=accelerator, # required by AcceleratorParserStage + workload=workload, # required by workload_parser_stage + mapping=mapping, # required by workload_parser_stage + dump_filename_pattern=dump_filename_pattern, # output file save pattern + pickle_filename=pickle_filename, # filename for pickled list of cmes + loma_lpf_limit=6, # required by LomaStage + loma_show_progress_bar=True, + # If we need access the same input data multiple times from the innermost memory level and the data size is smaller than the memory read bw, + # take into account only one-time access cost (assume the data can stay at the output pins of the memory as long as it is needed). + # By default, if the parameter is not defined, it will be set as False internally. + access_same_data_considered_as_no_access=True, + ) + + # Launch the MainStage + answers = mainstage.run() + # Get CME from answer + cmes = answers + + return cmes[0][0].energy_total, cmes[0][0].latency_total2, cmes + if __name__ == "__main__": workload = "zigzag/inputs/examples/workload/mobilenetv2.onnx" # workload = 'inputs.examples.workload.resnet18' diff --git a/zigzag/classes/stages/RemoveUnusedMemoryStage.py b/zigzag/classes/stages/RemoveUnusedMemoryStage.py new file mode 100644 index 00000000..f0d73eac --- /dev/null +++ b/zigzag/classes/stages/RemoveUnusedMemoryStage.py @@ -0,0 +1,216 @@ +from zigzag.classes.hardware.architecture.accelerator import Accelerator +from zigzag.classes.hardware.architecture.core import Core +from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy +from zigzag.utils import pickle_deepcopy +from zigzag.classes.stages.Stage import Stage +from typing import Generator + +import logging + +logger = logging.getLogger(__name__) + +#################### Description #################### +## This stage must be processed behind WorkloadStage. +## This stage removes unused memory level found by SearchUnusedMemoryStage. 
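+## Illustrative example (hypothetical values, assuming three memory levels indexed 0..2 from bottom to top):
+##   mem_update_list received from SearchUnusedMemoryStage could look like
+##     {"0": [{"I1": 0}, {"O": 1}], "1": [{"I1": 1}, {"O": 0}]}
+##   and mem_update_weight could be 1, meaning no weight data ever needs to sit above mem level 1.
+## For each layer, every memory level above the recorded index is then dropped for that operand.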
+################### Pseudo-code #################### +## Initialization: +## target_act_mem_level, target_output_mem_level: get from mem_update_list +## target_const_mem_level = mem_udpate_weight +## 1. Modify mem structure: +## for mem in mem_levels(sort_order: from bottom to top): +## if ['I'] in mem.served_operand and mem.mem_level > target_act_mem_level: +## remove ['I'] in mem.served_operand, mem_port_alloc +## if ['O'] in mem.served_operand and mem.mem_level > target_output_mem_level: +## remove ['O'] in mem.served_operand, mem_port_alloc +## if ['W'] in mem.served_operand and mem.mem_level > target_const_mem_level: +## remove ['W'] in mem.served_operand, mem_port_alloc +## 2. Remove unused memory +## for mem in mem_levels(sort_order: from top to bottom): +## if mem.served_operand == empty: +## do not add the current mem into the modified architecture +##################################################### + + +class RemoveUnusedMemoryStage(Stage): + def __init__( + self, + list_of_callables, + *, + accelerator, + layer, + mem_update_list, + mem_update_weight, + layer_list, + **kwargs, + ): + super().__init__(list_of_callables, **kwargs) + self.accelerator = accelerator + self.layer = layer + self.layer_list = layer_list + self.mem_update_list = mem_update_list + self.mem_update_weight = mem_update_weight + + def run(self) -> Generator: + modified_accelerator = self.generate_accelerator_with_removing_unused_memory() + sub_stage = self.list_of_callables[0]( + self.list_of_callables[1:], + accelerator=modified_accelerator, + layer=self.layer, + **self.kwargs, + ) + for cme, extra_info in sub_stage.run(): + yield cme, extra_info + + def generate_accelerator_with_removing_unused_memory(self): + ## Remove nouse memory level according to update_mem_list and mem_update_weight + curr_id = self.layer_list[ + self.layer + ] # current layer id (key) in mem_udpate_list + curr_id = str(curr_id) + output_operand = self.layer.memory_operand_links[ + self.layer.output_operand + ] # output representation in memory + core = next(iter(self.accelerator.cores)) + operational_array = core.operational_array + memory_hierarchy = core.memory_hierarchy + + if len(self.layer.constant_operands) == 1: + act_operand = self.layer.memory_operand_links[ + [ + operand + for operand in self.layer.input_operands + if operand not in self.layer.constant_operands + ][0] + ] # act representation in memory + const_operand = self.layer.memory_operand_links[ + self.layer.constant_operands[0] + ] # weight representation in memory + elif len(self.layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + # for input operand, we will represent all inputs as one input, since only their data size is used for required mem size calculation. 
+ act_operand = self.layer.memory_operand_links[ + self.layer.input_operands[0] + ] # act representation in memory + const_operand = self.layer.memory_operand_links[ + self.layer.input_operands[1] + ] # weight representation in memory + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(self.layer.pr_loop.keys()) + for ( + operand, + related_loop, + ) in self.layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [ + x for x in self.layer.constant_operands if x != act_operand + ] # weight representation in layer + assert len(weight_operand) == 1 + weight_operand: str = weight_operand[0] + act_operand = self.layer.memory_operand_links[ + act_operand + ] # map from layer representation to hardware memory representation + const_operand = self.layer.memory_operand_links[ + weight_operand + ] # weight representation in memory + + # Find target_act/const/output_mem_level + for pos, ele in enumerate(self.mem_update_list[curr_id]): + if list(ele.keys())[0] == act_operand: + target_act_mem_level = self.mem_update_list[curr_id][pos][act_operand] + if list(ele.keys())[0] == output_operand: + target_output_mem_level = self.mem_update_list[curr_id][pos][ + output_operand + ] + if len(self.layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + # Here we make a trick: treating the other input as const_operand + for pos, ele in enumerate(self.mem_update_list[curr_id]): + if list(ele.keys())[0] == act_operand: + target_const_mem_level = self.mem_update_list[curr_id][pos][ + act_operand + ] + else: + target_const_mem_level = self.mem_update_weight + + # Initialize the new memory hierarchy + mh_name = memory_hierarchy.name + new_mh_name = mh_name + "-without-unused-memory" + new_memory_hierarchy = MemoryHierarchy(operational_array, new_mh_name) + + # Add memories to the new memory hierarchy with the correct attributes + for curr_mem_level, memory_level in enumerate(memory_hierarchy.mem_level_list): + memory_instance = memory_level.memory_instance + operands = tuple(memory_level.operands) + port_alloc = memory_level.port_alloc_raw + served_dimensions_vec = memory_level.served_dimensions_vec + assert len(served_dimensions_vec) >= 1 + served_dimensions = served_dimensions_vec[0] + + new_memory_instance = pickle_deepcopy(memory_instance) + new_operands = [] + new_port_alloc = [] + if (act_operand in operands) and curr_mem_level <= target_act_mem_level: + new_operands.append(act_operand) + index_in_operands = operands.index(act_operand) + new_port_alloc.append(port_alloc[index_in_operands]) + if (const_operand in operands) and curr_mem_level <= target_const_mem_level: + new_operands.append(const_operand) + index_in_operands = operands.index(const_operand) + new_port_alloc.append(port_alloc[index_in_operands]) + if ( + output_operand in operands + ) and curr_mem_level <= target_output_mem_level: + new_operands.append(output_operand) + index_in_operands = operands.index(output_operand) + new_port_alloc.append(port_alloc[index_in_operands]) + new_operands = tuple(new_operands) + new_port_alloc = tuple(new_port_alloc) + new_served_dimensions = pickle_deepcopy(served_dimensions) + if len(new_operands) > 0: + new_memory_hierarchy.add_memory( + memory_instance=new_memory_instance, + operands=new_operands, + port_alloc=new_port_alloc, + 
                    served_dimensions=new_served_dimensions,
+                )
+
+        # Create the new core
+        id = core.id
+        dataflows = core.dataflows
+        new_id = id
+        new_dataflows = pickle_deepcopy(dataflows)
+        new_core = Core(
+            id=new_id,
+            operational_array=operational_array,
+            memory_hierarchy=new_memory_hierarchy,
+            dataflows=new_dataflows,
+        )
+
+        # Create the new accelerator
+        name = self.accelerator.name
+        new_name = name + "-without-unused-memory"
+        new_cores = {new_core}
+        new_accelerator = Accelerator(
+            name=new_name,
+            core_set=new_cores,
+        )
+
+        logger.info(f"Update mem architecture for layer {self.layer}...")
+
+        # RemoveUnusedMemoryStage.visualize_modified_memory_structure(new_memory_hierarchy)
+
+        return new_accelerator
+
+    @staticmethod
+    def visualize_modified_memory_structure(new_memory_hierarchy):
+        # Visualization for debugging
+        from zigzag.visualization.graph.memory_hierarchy import (
+            visualize_memory_hierarchy_graph,
+        )
+
+        visualize_memory_hierarchy_graph(new_memory_hierarchy)
diff --git a/zigzag/classes/stages/SearchUnusedMemoryStage.py b/zigzag/classes/stages/SearchUnusedMemoryStage.py
new file mode 100644
index 00000000..190a2575
--- /dev/null
+++ b/zigzag/classes/stages/SearchUnusedMemoryStage.py
@@ -0,0 +1,501 @@
+from zigzag.classes.stages.Stage import Stage
+
+import networkx as nx
+from typing import Generator
+from zigzag.classes.workload.dummy_node import DummyNode
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+#################### Description ####################
+## This stage must be processed before WorkloadStage.
+## This stage figures out the unused memory levels for "I", "W", "O" when the size of a lower memory level is enough to hold all data, considering that the output data of the previous layer can be directly used by the next layer. As a result, the energy / latency related to these memories will be removed.
+## The general criteria are:
+## If a low-level memory is big enough to hold both "I" and "O" data of the current layer, the memories above it will be labeled as unused.
+## If a low-level memory is big enough to hold the "W" data of the entire workload, the memories above it will be labeled as unused.
+## The above method only applies to layers along the same branch; otherwise (for branch starting nodes or branch final nodes) the "O" data will go back to the top possible memory.
+## In RemoveUnusedMemoryStage, unused memories across all layers, labeled in this stage, will be removed from the memory architecture.
+## For now, the number of cores must be 1.
+#################### Pseudo-code ####################
+## Initialization:
+##   mem_update_list = [layer_ids: {"I" / "O": -1}]  ## mem level of different operands of each layer (there should be no -1 after self.update_top_mem_level())
+##   each_layer_IO_data_size = [layer_ids: {"I" / "O": size}]  ## input / output data size of each layer
+##   mem_update_weight = top_mem_level  ## top mem level to put weight
+##   weight_size_entire_workload = weight_size  ## weight data size of entire workload
+## Generate:
+##   layer_execution_order = list( topological_sort(layer_graph) )
+## Locate top mem level for each operand of each layer. Store results in mem_update_list and mem_update_weight.
+## for layer in all_layers:
+##   if layer.index != 0:  ## not the 1st execution layer
+##     mem_update_list[layer]["I"] = mem_update_list[previous_layer]["O"]
+##   if len(layer.next_node) > 1 or len(next_layer.previous_node) > 1:  ## starting node of branches / final node of branches
+##     | if layer.index == 0:
+##     |   mem_update_list[layer]["I" / "O"] updates to the top input/output mem level
+##     | else:
+##     |   mem_update_list[layer]["O"] updates to the top output mem level
+##     | mem_update_weight = top weight mem level, if mem_update_weight > top weight mem level
+##     |
+##   else:
+##     for mem in mem_levels(sort_order: from top to bottom):
+##       if sum(layer[operand_size] for operand in mem.operands) <= mem.size:
+##         if ["I", "O"] both in mem.operands:
+##           mem_update_list[layer]["O"] = current_mem_level
+##           if layer.index == 0:  ## the 1st execution layer
+##             mem_update_list[layer]["I"] = current_mem_level
+##         if ("W" in mem.operands) and (current_mem_level < mem_update_weight):
+##           mem_update_weight = current_mem_level
+#####################################################
+# Special note for Adder layers:
+# Currently the algorithm is tricky for Adder layers. For a conv/pool layer, the required I, O sizes are put in
+# each_layer_IO_data_size and the weight data size is accumulated in weight_size_entire_workload.
+# But for Adder layers, (1) there is no weight operand (or constant operand); (2) there are two input operands;
+# (3) the info regarding which of the two operands is represented as I1 or I2 is not saved in self.workload,
+# though it is defined in the input file.
+# So, the current solution is:
+# (1) for weight, the data amount is 0, which means weight_size_entire_workload does not consider Adder layers.
+# (2) for act, we add up the data size of the two (or multiple) inputs and treat the sum as the act data size
+# of the current layer, which is stored in each_layer_IO_data_size.
+# What does this mean?
+# It means that for Adder layers the required act data size is over-estimated, because we also include the data
+# amount of the other operand, even though a separate memory may have been defined for that operand.
+# In other words, a mem level with enough size to hold both O and I1
+# (assume I1 is the mem representation for one input)
+# may be judged by the code as too small, so the output cannot be stored at this level.
+# Keep in mind:
+# this is only a problem when you use a manually-defined workload and it contains Adder layers;
+# there is no problem if your workload is an .onnx file, because Adder layers are then skipped by default.
+# Is there a solution?
+# The reason it cannot be fixed is that we do not know which operand comes from which layer.
+# It cannot be fixed unless this information is saved in the self.workload object,
+# which is a networkx graph.
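+# Illustrative example (hypothetical sizes): an Adder layer with two 1 Mb inputs and a 1 Mb output is budgeted
+# here as 2 Mb of act data plus 1 Mb of output data. A 2.5 Mb memory level is therefore judged too small,
+# even though it could in fact hold I1 (1 Mb) and O (1 Mb) if I2 were served by a separate memory.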
+ + +class SearchUnusedMemoryStage(Stage): + def __init__(self, list_of_callables, *, accelerator, workload, **kwargs): + super().__init__(list_of_callables, **kwargs) + self.accelerator = accelerator + self.workload = workload + ## Initialization + self.mem_update_list = {} + self.each_layer_IO_data_size = {} # unit: bit + core_id = accelerator.cores[0].id # correct only for single-core hardware + self.core_mem_level_list = accelerator.get_core( + core_id=core_id + ).memory_hierarchy.mem_level_list + self.mem_update_weight = ( + len(self.core_mem_level_list) - 1 + ) # index of the top memory + self.weight_size_entire_workload = 0 # unit: bit + self.layer_list = {} # layer name and its corresponding id + core = accelerator.get_core(core_id=core_id) + for id, layer in enumerate(nx.topological_sort(workload)): + if ( + type(layer) != DummyNode + ): # create record on memory level, data size of each operand for un-dummy nodes + # identify the weight operand + if len(layer.constant_operands) == 1: + weight_operand = layer.constant_operands[0] + else: + if len(layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + # for input operand, we will represent all inputs as one input, since only their data size is used for required mem size calculation. + input_operand = layer.input_operands[0] + output_operand = layer.output_operand + input_data_size = 0 + for operand in layer.input_operands: + input_data_size += layer.operand_size_bit[operand] + self.mem_update_list[f"{id}"] = [ + {operand: -1} + for operand in core.mem_hierarchy_dict.keys() + if operand + in [ + layer.memory_operand_links[output_operand], + layer.memory_operand_links[input_operand], + ] + ] + self.each_layer_IO_data_size[f"{id}"] = [ + { + layer.memory_operand_links[ + output_operand + ]: layer.operand_size_bit[output_operand], + layer.memory_operand_links[ + input_operand + ]: input_data_size, + } + ] + self.layer_list[layer] = id + continue + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(layer.pr_loop.keys()) + for ( + operand, + related_loop, + ) in layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [ + x for x in layer.constant_operands if x != act_operand + ] + assert len(weight_operand) == 1 + weight_operand: str = weight_operand[0] + self.mem_update_list[f"{id}"] = [ + {operand: -1} + for operand in core.mem_hierarchy_dict.keys() + if operand != layer.memory_operand_links[weight_operand] + ] + self.each_layer_IO_data_size[f"{id}"] = [ + { + layer.memory_operand_links[operand]: layer.operand_size_bit[ + operand + ] + for operand in layer.memory_operand_links.keys() + if operand != weight_operand + } + ] + self.weight_size_entire_workload += layer.operand_size_bit[ + weight_operand + ] + self.layer_list[layer] = id + + def run(self, workload_data_always_from_top_mem=False) -> Generator: + self.update_top_mem_level() # figure out the lowest possible mem level for all operands for all layers + + if workload_data_always_from_top_mem: + # [OPTIONAL] re-define the input/output mem level of first/last layer to the top possible mem level. This + # is specially designed for the case that workload input and output must be stored in the top mem level. 
+ self.update_mem_level_for_loading_data() + + sub_stage = self.list_of_callables[0]( + self.list_of_callables[1:], + accelerator=self.accelerator, + workload=self.workload, + mem_update_list=self.mem_update_list, + mem_update_weight=self.mem_update_weight, + layer_list=self.layer_list, + **self.kwargs, + ) + for cme, (layer, extra_info) in sub_stage.run(): + yield cme, (layer, extra_info) + + def update_top_mem_level(self): + """ + Update mem_update_list and mem_update_weight according to the algorithm description at the file beginning. + """ + self.remove_dummy_nodes_in_workload() # remove dummy nodes for the ease of telling the branch starting or final nodes + + ## Update mem_update_list and mem_update_weight + for id, layer in enumerate(nx.topological_sort(self.workload)): + branch_starting_node = ( + True if self.workload.out_degree(layer) > 1 else False + ) # starting node of branches + branch_final_node = ( + True + if self.workload.out_degree(layer) == 1 + and self.workload.in_degree(list(self.workload.successors(layer))[0]) + > 1 + else False + ) + output_operand = layer.memory_operand_links[ + layer.output_operand + ] # output representation in memory + curr_id = self.layer_list[ + layer + ] # current layer id (key) in mem_udpate_list + if len(layer.constant_operands) == 1: + const_operand = layer.memory_operand_links[ + layer.constant_operands[0] + ] # weight representation in memory + act_operand = layer.memory_operand_links[ + [ + operand + for operand in layer.input_operands + if operand not in layer.constant_operands + ][0] + ] # act representation in memory + else: + if len(layer.constant_operands) == 0: + # special case when defining workload manually: + # the constant operands list is empty for such as "Adder" layers + const_operand = None + act_operand = layer.memory_operand_links[layer.input_operands[0]] + else: + # special case when defining workload manually: + # both I and W are considered as constant operands for the first layer + pr_loop_keys = tuple(layer.pr_loop.keys()) + for ( + operand, + related_loop, + ) in layer.operand_dimensionality_order.items(): + if pr_loop_keys[0] in related_loop: + act_operand = operand + weight_operand: list = [ + x for x in layer.constant_operands if x != act_operand + ] + weight_operand: str = weight_operand[0] + act_operand = layer.memory_operand_links[ + act_operand + ] # map from layer representation to hardware memory representation + const_operand = layer.memory_operand_links[ + weight_operand + ] # weight representation in memory + if id != 0: ## not the first layer + ## Assign mem_udpate_list[layer]["I"] = mem_udpate_list[previous_layer]["O"] + prev_layer = list(self.workload.predecessors(layer))[ + 0 + ] # previous layer node (object) + prev_layer_id = self.layer_list[prev_layer] # previous layer id + prev_layer_output_operand = ( + prev_layer.output_operand + ) # output representation in memory of previous layer + for ele in self.mem_update_list[ + f"{prev_layer_id}" + ]: # find the output mem level of previous layer + try: + prev_layer_output_level = ele[f"{prev_layer_output_operand}"] + except ( + KeyError + ): # skip if the key is incorrect, as there will only be one that match. + pass + self.update_IO_mem_level( + curr_id, act_operand, prev_layer_output_level + ) # update the input mem level of current layer + if id == 28: + pass + if ( + branch_starting_node or branch_final_node + ): ## branch starting node or branch final node or permited dummy nodes (e.g. 
Adder layer) + ## Update input, weight, output mem level for branch starting node and branch final node + ## Find the top mem level for input if it is the first layer, update mem_udpate_list of current layer + if id == 0: ## the first layer + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if act_operand in served_operands: + self.update_IO_mem_level( + curr_id, act_operand, curr_mem_level + ) # update the input mem level of current layer if it is the first layer + break + ## Find the top mem level for output, update mem_update_list of current layer + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if output_operand in served_operands: + self.update_IO_mem_level( + curr_id, output_operand, curr_mem_level + ) # update the output mem level of current layer + break + ## Find the top mem level for weight, update mem_update_weight of current layer to the top weight mem level if mem_update_weight is bigger + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if ( + const_operand in served_operands + ): # identify the top weight mem level + if ( + curr_mem_level < self.mem_update_weight + ): # mem_update_weight is bigger than the top weight mem level + self.mem_update_weight = curr_mem_level + break + else: ## node (layer) that is not a branch starting node or a branch final node + ## Iterate the memory level and update input, weight, output mem level + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + ## Update input, weight, output mem level + avail_mem_size = ( + mem.memory_instance.size + ) # available hardware mem size + + try: + # we need to grab the next layer name, which is a non-Adder layer for sure + # if next layer is an Adder layer, then branch_final_node=True for the current layer, + # so, the simulation will not reach to this "else" branch. + next_layer = list(self.workload.successors(layer))[0] + # next, we find out the layer representation for the act operand of the next layer + const_layer_operand_of_next_layer = ( + next_layer.constant_operands[0] + ) + act_layer_operand_of_next_layer = [ + operand + for operand in next_layer.input_operands + if operand != const_layer_operand_of_next_layer + ][0] + # then, we will fetch the mem representation for the act operand of the next layer + act_mem_operand_of_next_layer = next_layer.memory_operand_links[ + act_layer_operand_of_next_layer + ] + # check if the current mem level serve the act operand in the next layer + mem_serve_act_in_next_layer = ( + True + if (act_mem_operand_of_next_layer in served_operands) + else False + ) + except ( + IndexError + ): # there is no next layer, which means the current layer is the last layer + # As for the last layer, we will instead check + # if the mem serves act operand of the current layer. 
+ mem_serve_act_in_next_layer = ( + True if (act_operand in served_operands) else False + ) + + mem_serve_io_both = ( + True + if mem_serve_act_in_next_layer + and (output_operand in served_operands) + else False + ) # ["I", "O"] both in mem.served_operands + mem_serve_weight = ( + True if (const_operand in served_operands) else False + ) # mem.served_operands = ["W"] + + # we need to change served_operands if the current layer is an Adder layer, + # for the ease of calculation of required input data size. + # Since an Adder layer has two inputs, + # but in each_layer_IO_data_size, data size of two inputs are put under one key, + # so we have to update served_operands to ensure the key used in each_layer_IO_data_size is in it. + if ( + len(layer.constant_operands) == 0 and mem_serve_io_both + ): # the layer type is an Adder layer, which has multiple input operands + served_operands = [ + output_operand, + layer.memory_operand_links[layer.input_operands[0]], + ] + + if mem_serve_io_both or mem_serve_weight: + required_IO_data_size = sum( + [ + self.each_layer_IO_data_size[f"{curr_id}"][0][operand] + for operand in served_operands + if operand != const_operand + ] + ) + required_weight_size = ( + self.weight_size_entire_workload + if const_operand in served_operands + else 0 + ) + required_total_size = ( + required_IO_data_size + required_weight_size + ) # required size to put data in current mem level + if ( + required_total_size <= avail_mem_size + ): # sum(layer[operand_size] for operand in mem.operands) <= mem.size + if mem_serve_io_both: + if id == 0: + self.update_IO_mem_level( + curr_id, act_operand, curr_mem_level + ) # update input mem level + self.update_IO_mem_level( + curr_id, output_operand, curr_mem_level + ) # update output mem level + if ( + curr_mem_level < self.mem_update_weight + ) and mem_serve_weight: # update weight mem level + self.mem_update_weight = curr_mem_level + ## [OPTIONAL CHECK] assert check if there is -1 value in mem_update_list + ## [NOTE] Until here, if there is still -1 value in mem_update_list, it means the size of top mem level for IO is not big enough. + for layer_ele in self.mem_update_list.values(): + for operand_dict in layer_ele: + assert ( + list(operand_dict.values())[0] >= 0 + ), "SearchUnusedMemoryStage fisnishes abnormally, there are still layers with top mem levels not figured out." + + def update_mem_level_for_loading_data(self): + """ + [OPTIONAL FUNCTION] This is an optional function. + Depending on your requirement, sometimes data loading from the top mem level and offloading to the top mem level is a must. + If that is the your case, add this function to self.run(). + Otherwise, if the input is generated on-chip at the lowest possible input mem level and the output is stored on-chip at the lowest possible output mem level, remove this function from self.run(). 
+ [FUNCTION OBJECT] + Update mem_update_list of first and last layer, so that the input data of first layer still is loaded from top input mem level and the output of last layer still is offloaded to top output mem level + """ + self.remove_dummy_nodes_in_workload() # remove dummy nodes for the ease of telling the branch starting or final nodes + + ## Update mem_update_list and mem_update_weight + for id, layer in enumerate(nx.topological_sort(self.workload)): + act_operand = layer.memory_operand_links[ + [ + operand + for operand in layer.input_operands + if operand not in layer.constant_operands + ][0] + ] # act representation + output_operand = layer.output_operand # output representation + curr_id = self.layer_list[ + layer + ] # current layer id (key) in mem_udpate_list + if ( + id == 0 + ): # the first layer: update activation mem level to the top possible mem level + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if act_operand in served_operands: + self.update_IO_mem_level( + curr_id, act_operand, curr_mem_level + ) # update the input mem level of current layer if it is the first layer + break + if ( + id == len(self.layer_list) - 1 + ): # the last layer: update output mem level to the top possible mem level + for curr_mem_level, mem in reversed( + list(enumerate(self.core_mem_level_list)) + ): + served_operands = list( + mem.mem_level_of_operands.keys() + ) # Check the served operand of current mem + if output_operand in served_operands: + self.update_IO_mem_level( + curr_id, output_operand, curr_mem_level + ) # update the output mem level of current layer if it is the last layer + break + + def remove_dummy_nodes_in_workload(self): + ## Remove dummy nodes (layers) in the graph (assume there is no branch from a non-dummy node to dummy node) + ## Redirect the outgoing edges of dummy nodes to non-dummy nodes + ## Algorithm: + ## for each dummy node, add edges between its predecessor nodes and successor nodes; then remove the dummy node. 
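+        ## Illustrative example (hypothetical node names): a chain conv1 -> reshape (DummyNode) -> conv2
+        ## is rewired to conv1 -> conv2, so conv2 keeps a non-dummy predecessor after the dummy node is removed.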
+        #############################################
+        ## Uncomment the following 4 lines to visualize the network for debugging:
+        ## import matplotlib.pyplot as plt
+        ## pos = nx.spring_layout(self.workload)
+        ## nx.draw(self.workload, pos, with_labels=True, node_color="lightblue", font_weight="bold")
+        ## plt.show()
+        #############################################
+        dummy_nodes = [
+            node for node in self.workload.nodes() if type(node) == DummyNode
+        ]
+        for dummy_node in dummy_nodes:
+            for successor_node in list(self.workload.successors(dummy_node)):
+                for predecessor_node in list(self.workload.predecessors(dummy_node)):
+                    self.workload.add_edge(predecessor_node, successor_node)
+        self.workload.remove_nodes_from(dummy_nodes)
+
+    def update_IO_mem_level(self, layer_id, operand, target_level):
+        """
+        Update self.mem_update_list as:
+        self.mem_update_list[layer_id][operand_index][operand] = target_level
+        """
+        for pos, ele in enumerate(self.mem_update_list[f"{layer_id}"]):
+            if list(ele.keys())[0] == f"{operand}":
+                self.mem_update_list[f"{layer_id}"][pos][f"{operand}"] = target_level
diff --git a/zigzag/classes/stages/__init__.py b/zigzag/classes/stages/__init__.py
index efe948b0..fdf696a7 100644
--- a/zigzag/classes/stages/__init__.py
+++ b/zigzag/classes/stages/__init__.py
@@ -26,6 +26,8 @@
 from .Stage import Stage, MainStage
 from .TemporalOrderingConversionStage import TemporalOrderingConversionStage
 from .WorkloadStage import WorkloadStage
+from .RemoveUnusedMemoryStage import RemoveUnusedMemoryStage
+from .SearchUnusedMemoryStage import SearchUnusedMemoryStage
 
 # Parameter providers: these parameters are provided to substages by the following classes:
 # - accelerator: AcceleratorParserStage
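For reference, a minimal usage sketch of the new entry point added in zigzag/api.py. The workload, accelerator and mapping identifiers are the example inputs already used by the tests in this diff; the keyword arguments simply restate the defaults shown above.

from zigzag.api import get_hardware_performance_zigzag_without_unused_memory

# Example inputs reused from the tests in this diff.
workload = "zigzag/inputs/examples/workload/resnet18.onnx"
accelerator = "zigzag.inputs.examples.hardware.TPU_like"
mapping = "zigzag.inputs.examples.mapping.tpu_like"

energy, latency, cmes = get_hardware_performance_zigzag_without_unused_memory(
    workload,
    accelerator,
    mapping,
    opt="latency",  # "energy" and "EDP" are also accepted
    dump_filename_pattern="outputs/{datetime}.json",
    pickle_filename="outputs/list_of_cmes.pickle",
)
print(f"Total energy: {energy}, total latency: {latency}")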