diff --git a/.gitignore b/.gitignore index 08983801..89807f12 100644 --- a/.gitignore +++ b/.gitignore @@ -149,4 +149,7 @@ docs/Makefile docs/make.bat # documentation output -html/ \ No newline at end of file +html/ + +# debug file +debug* \ No newline at end of file diff --git a/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py b/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py index 86acd714..8287ba69 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py @@ -13,10 +13,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5582430184.085, 8343378), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (762066732.5049998, 3003074), - "zigzag/inputs/examples/workload/resnet18.onnx": (1743190534.155, 5305825), - "zigzag.inputs.examples.workload.resnet18": (2087322696.315, 6155355), + "zigzag/inputs/examples/workload/alexnet.onnx": (5582059481.445, 8343378), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (819971935.77, 2430583), + "zigzag/inputs/examples/workload/resnet18.onnx": (1763135800.67, 5001291), + "zigzag.inputs.examples.workload.resnet18": (2090252961.0700002, 5858437), } diff --git a/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py b/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py index ff7ea9a8..c002b8d9 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py @@ -13,10 +13,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5681909351.240001, 8299150), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (919452681.2249999, 2894129), - "zigzag/inputs/examples/workload/resnet18.onnx": (1789888904.4450002, 3472280), - "zigzag.inputs.examples.workload.resnet18": (2348207081.7949996, 4238517), + "zigzag/inputs/examples/workload/alexnet.onnx": (5679695605, 8299150), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (901092009, 2610609), + "zigzag/inputs/examples/workload/resnet18.onnx": (1730672410, 3262009), + "zigzag.inputs.examples.workload.resnet18": (2265438430, 4017227), } diff --git a/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py b/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py index 682604d4..c4b0c5e6 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py @@ -13,10 +13,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (6040086796.366001, 8389669), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (958401881.3470002, 1964453), - "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3257898), - "zigzag.inputs.examples.workload.resnet18": (2220861655.6660004, 3934616), + "zigzag/inputs/examples/workload/alexnet.onnx": (6044768678, 8370470), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060, 1965457), + "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681, 3257898), + "zigzag.inputs.examples.workload.resnet18": (2220861655, 3934616), } diff --git a/zigzag/classes/cost_model/cost_model.py b/zigzag/classes/cost_model/cost_model.py index cf684c0c..5369dfd1 100644 --- a/zigzag/classes/cost_model/cost_model.py +++ 
b/zigzag/classes/cost_model/cost_model.py @@ -211,12 +211,14 @@ def __init__( accelerator, layer, spatial_mapping, + spatial_mapping_int, temporal_mapping, access_same_data_considered_as_no_access=True, ): self.accelerator = accelerator self.layer = layer self.spatial_mapping = spatial_mapping + self.spatial_mapping_int = spatial_mapping_int # the original spatial mapping without decimal self.temporal_mapping = temporal_mapping self.access_same_data_considered_as_no_access = ( access_same_data_considered_as_no_access @@ -246,9 +248,10 @@ def __init__( """ generate the integer spatial mapping from fractional spatial mapping (due to greedy mapping support). Later the fractional one is used for calculating energy, and the integer one is used for calculating latency""" - self.spatial_mapping_dict_int = spatial_mapping_fractional_to_int( - self.spatial_mapping.mapping_dict_origin - ) + # self.spatial_mapping_dict_int = spatial_mapping_fractional_to_int( + # self.spatial_mapping.mapping_dict_origin + # ) + self.spatial_mapping_dict_int = self.spatial_mapping_int.mapping_dict_origin # For constructing Mapping object, the last parameter "self.access_same_data_considered_as_no_access" is optional self.mapping = Mapping( diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index e10453eb..4a7d4ddc 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -6,6 +6,8 @@ from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy from zigzag.classes.hardware.architecture.operational_array import OperationalArray +import math + ## Class that generates valid user-format spatial mappings. class UserSpatialMappingGenerator: @@ -19,6 +21,7 @@ def __init__( defined_mapping=None, enable_mix_spatial_mapping_generation=False, maximize_hardware_utilization=True, + enable_weight_diagonal_mapping=False, ) -> None: self.layer = layer self.accelerator = accelerator @@ -27,11 +30,13 @@ def __init__( enable_mix_spatial_mapping_generation ) self.maximize_hardware_utilization = maximize_hardware_utilization + self.enable_weight_diagonal_mapping = enable_weight_diagonal_mapping def run(self): return self.generate_user_spatial_mappings( enable_mix_spatial_mapping_generation=self.enable_mix_spatial_mapping_generation, maximize_hardware_utilization=self.maximize_hardware_utilization, + enable_weight_diagonal_mapping=self.enable_weight_diagonal_mapping, ) ## Generator that yields user-defined spatial mappings. 
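For context on the spatial_mapping_int plumbing above: greedy mapping can produce fractional spatial unrolling sizes, which the docstring in cost_model.py says are kept for energy estimation, while latency needs whole hardware iterations. A minimal sketch of that distinction (hypothetical names, not the zigzag API):

import math

fractional_mapping = {"D1": ("K", 5.33), "D2": ("OX", 3.5)}

def to_integer_mapping(mapping):
    # round every unroll size up to mimic what the hardware actually executes
    return {dim: (loop, math.ceil(size)) for dim, (loop, size) in mapping.items()}

print(to_integer_mapping(fractional_mapping))  # {'D1': ('K', 6), 'D2': ('OX', 4)}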
@@ -51,7 +56,10 @@ def run(self): # layer_dim can be unrolled if the BW allows it (assumes flexible "bus" reads) # \endcode def generate_user_spatial_mappings( - self, enable_mix_spatial_mapping_generation, maximize_hardware_utilization + self, + enable_mix_spatial_mapping_generation, + maximize_hardware_utilization, + enable_weight_diagonal_mapping, ): core_id = self.layer.core_allocation core: Core = self.accelerator.get_core(core_id=core_id) @@ -126,7 +134,28 @@ def generate_user_spatial_mappings( defined_mapping is not None and defined_mapping.get(oa_dim.name) is not None ): - oa_dim_unrollings = [defined_mapping.get(oa_dim.name)] + # scale down the defined_mapping size if it exceeds the layer dim size + ori_loop = defined_mapping.get(oa_dim.name) + loop_to_reform = [] + if self.is_nested_tuple(ori_loop): # mix sm loop + for sub_loop in ori_loop: + sub_loop_dim = sub_loop[0] + sub_loop_size = sub_loop[1] + if sub_loop_dim in self.layer.loop_dim_size.keys(): + if sub_loop_size > self.layer.loop_dim_size[sub_loop_dim]: + sub_loop_size = self.layer.loop_dim_size[sub_loop_dim] + loop_to_reform.append((sub_loop_dim, sub_loop_size)) + else: # single layer sm loop + loop_dim = ori_loop[0] + loop_size = ori_loop[1] + if loop_dim in self.layer.loop_dim_size.keys(): + if loop_size > self.layer.loop_dim_size[loop_dim]: + loop_size = self.layer.loop_dim_size[loop_dim] + loop_to_reform.append((loop_dim, loop_size)) + loop_to_reform = tuple(loop_to_reform) + if len(loop_to_reform) == 0: + loop_to_reform = None + oa_dim_unrollings = [loop_to_reform] else: oa_dim_unrollings = [] oa_dim_unrolling_hints = user_spatial_mapping_hint[oa_dim.name] @@ -161,20 +190,6 @@ def generate_user_spatial_mappings( ) = self.sort_oa_dim_unrollings_in_the_order_of_utilization( oa_dim_unrollings, descending=True ) - if len(oa_dim_unrollings) > 0: # oa_dim_unrollings is not [] - # Then only keep the combs in oa_dim_unrollings that have the highest oa_dim mapping utilization - # The closer to the front, the higher the oa_dim utilization rate. - updated_oa_dim_unrollings = [oa_dim_unrollings[0]] - # Check if there are other sm loops that has the same utilization with the highest one. - for i in range(1, len(hardware_utilization)): - if hardware_utilization[i] == hardware_utilization[0]: - updated_oa_dim_unrollings.append(oa_dim_unrollings[i]) - # [Optional] To reduce the simulation time, when there are still too many spatial unrollings, - # We keep only the first two unrollings for each oa_dim. - # You can comment out the next two lines if you want to check all spatial unrollings. - if len(updated_oa_dim_unrollings) > 2: - updated_oa_dim_unrollings = updated_oa_dim_unrollings[0:2] - oa_dim_unrollings = updated_oa_dim_unrollings # In case there are no unrollings (of size > 1) possible, add a single unrolling of size 1. # The loop dimension we pick is randomly chosen as the first loop dimension in the layer. @@ -186,33 +201,168 @@ def generate_user_spatial_mappings( # Now we have for each operational array dimension the layer dimensions and size they can be unrolled without fractional remainder. # Now we have to combine them into user-defined spatial mappings. 
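# A simplified, self-contained sketch (hypothetical helper, not part of this PR) of the
# legality rule that the combination loop below enforces via check_spatial_loop_legality:
# a combination is legal only if, for every layer dim, the product of all unrollings
# mapped onto it (possibly across several oa dims) stays within the layer dim size.
def is_legal(combination, loop_dim_size):
    left = dict(loop_dim_size)
    for unrolling in combination:
        if unrolling is None:
            continue
        # a mix unrolling is a nested tuple, e.g. (("K", 2), ("OX", 2))
        sub_loops = unrolling if isinstance(unrolling[0], tuple) else (unrolling,)
        for layer_dim, size in sub_loops:
            if layer_dim in left:
                left[layer_dim] /= size
    return all(v >= 1 for v in left.values()), left

legal, left = is_legal(((("K", 2), ("OX", 2)), ("K", 16)), {"K": 32, "OX": 8})
print(legal, left)  # True {'K': 1.0, 'OX': 4.0}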
+        # record the number of yields
+        yield_count = 0
+        yield_count_limit = 2  # used to control the yield count when maximize_hardware_utilization == True
         for combination in itertools.product(*unrollings):
+            if maximize_hardware_utilization and yield_count >= yield_count_limit:
+                # 2 means: only check the top 2 spatial mappings with the highest hardware utilization
+                # Modify yield_count_limit if you want to check more spatial mappings.
+                break
+
+            legal_spatial_loop, left_layer_dim_size = self.check_spatial_loop_legality(
+                combination=combination, layer=self.layer
+            )
+            if not legal_spatial_loop:
+                continue
             # Zip the combination (which is a (layer_dim, layer_size) for each oa_dim with the oa_dim names.
             oa_dim_names = [oa_dim.name for oa_dim in oa_dims]
-            # Extra check on the total unrolling size of a layer dim, if it is mapped on >=2 dimensions.
-            combination_check = {
-                layer_dim: layer_size
-                for layer_dim, layer_size in self.layer.loop_dim_size.items()
+
+            user_spatial_mapping = {
+                oa_dim_name: unrolling
+                for (oa_dim_name, unrolling) in zip(oa_dim_names, combination)
+                if unrolling is not None
             }
-            for unrolling_in_combination in combination:
-                if unrolling_in_combination is None:
+            # Add act ir loop if it is weight stationary and the innermost memories serve for act.
+            if enable_weight_diagonal_mapping:
+                user_spatial_mapping = self.add_input_pr_spatial_loop_if_enabled(
+                    layer=self.layer,
+                    provided_user_spatial_mapping=user_spatial_mapping,
+                    user_spatial_mapping_hint=user_spatial_mapping_hint,
+                    innermost_levels=innermost_levels,
+                    left_layer_dim_size=left_layer_dim_size,
+                    enable_mix_spatial_mapping_generation=enable_mix_spatial_mapping_generation,
+                )
+            yield user_spatial_mapping
+            yield_count += 1
+        # If yield_count == 0, no legal spatial mapping was found.
+        # One reason is that a user-provided spatial mapping exceeds the layer dim size,
+        # so the loop cannot pass the check.
+        # The other reason could be that a layer dim is mapped on multiple oa dims,
+        # so their product exceeds the layer dim size.
+        # As a quick fix for the second cause, we reform the sm loops, but only for single-layer-dim mappings.
+        if yield_count == 0:
+            for combination in itertools.product(*unrollings):
+                is_mix_comb = False
+                for loop in combination:
+                    if self.is_nested_tuple(loop):
+                        is_mix_comb = True
+                        continue
+                if is_mix_comb:
+                    # The fix is not applied to mix sm loops.
+                    continue
-                if self.is_nested_tuple(unrolling_in_combination):
-                    for sub_unrolling_in_combination in unrolling_in_combination:
-                        unrolling_layer_dim = sub_unrolling_in_combination[0]
-                        unrolling_layer_size = sub_unrolling_in_combination[1]
-                        if unrolling_layer_dim in combination_check.keys():
-                            combination_check[
-                                unrolling_layer_dim
-                            ] /= unrolling_layer_size
+                if maximize_hardware_utilization and yield_count >= yield_count_limit:
+                    # 2 means: only check the top 2 spatial mappings with the highest hardware utilization
+                    # Modify yield_count_limit if you want to check more spatial mappings.
+                    break
+                (
+                    new_combination,
+                    left_layer_dim_size,
+                ) = self.shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims(
+                    combination=combination,
+                    layer=self.layer,
+                )
+                # Zip the combination (which is a (layer_dim, layer_size) for each oa_dim with the oa_dim names.
+                oa_dim_names = [oa_dim.name for oa_dim in oa_dims]
+
+                user_spatial_mapping = {
+                    oa_dim_name: unrolling
+                    for (oa_dim_name, unrolling) in zip(oa_dim_names, new_combination)
+                    if unrolling is not None
+                }
+                # Add act ir loop if it is weight stationary and the innermost memories serve for act.
+                if enable_weight_diagonal_mapping:
+                    user_spatial_mapping = self.add_input_pr_spatial_loop_if_enabled(
+                        layer=self.layer,
+                        provided_user_spatial_mapping=user_spatial_mapping,
+                        user_spatial_mapping_hint=user_spatial_mapping_hint,
+                        innermost_levels=innermost_levels,
+                        left_layer_dim_size=left_layer_dim_size,
+                        enable_mix_spatial_mapping_generation=enable_mix_spatial_mapping_generation,
+                    )
+                yield user_spatial_mapping
+                yield_count += 1
+
+        assert (
+            yield_count > 0
+        ), "No legal spatial mapping was found. Please make sure the provided spatial mappings do not exceed the layer dimension sizes."
+
+    def shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims(
+        self, combination, layer
+    ):
+        new_combination = combination
+        legal_spatial_loop, left_layer_dim_size = self.check_spatial_loop_legality(
+            combination=new_combination, layer=layer
+        )
+        while not legal_spatial_loop:
+            new_combination_next = list(new_combination)
+            for layer_dim, layer_dim_size in left_layer_dim_size.items():
+                if layer_dim_size < 1:
+                    scaled_success = False
+                    for oa_index in range(
+                        len(new_combination_next) - 1, -1, -1
+                    ):  # reverse order on oa dims
+                        (
+                            mapped_layer_dim,
+                            mapped_layer_dim_size,
+                        ) = new_combination_next[oa_index]
+                        if mapped_layer_dim_size > 1:
+                            # shrink the mapped layer dim size
+                            mapped_layer_dim_size -= 1
+                            new_combination_next[oa_index] = (
+                                mapped_layer_dim,
+                                mapped_layer_dim_size,
+                            )
+                            scaled_success = True
+                            break
                         else:
-                            # The unrolled layer dim does not exist in current layer.
-                            # This only happens when the spatial mapping is user-defined, which
-                            # contains non-existent layer dims in current layer.
+                            # because a layer dim can be mapped on multiple oa dims, we move on to the next oa dim
                             pass
-                else:
-                    unrolling_layer_dim = unrolling_in_combination[0]
-                    unrolling_layer_size = unrolling_in_combination[1]
+                    # assert: if not scaled_success, the sm loop cannot pass the check,
+                    # even though every mapped size on this layer dim is already 1
+                    assert scaled_success, (
+                        f"The spatial loop cannot fit the current hardware dimension after scaling. "
+                        f"Current spatial loop: {new_combination}"
+                    )
+            new_combination_next = tuple(new_combination_next)
+            # Next we check whether new_combination_next is a legal loop.
+            # If it is, we keep the current combination rather than new_combination_next. The reason is that
+            # new_combination can cover the entire layer dim, while new_combination_next is smaller than
+            # the layer dim, so the actual sm loop for the layer dim would be a decimal number.
+            # In that case, we round it up to mimic the real behavior on hardware.
+            (
+                legal_spatial_loop,
+                left_layer_dim_size_next,
+            ) = self.check_spatial_loop_legality(
+                combination=new_combination_next, layer=layer
+            )
+            if not legal_spatial_loop:
+                new_combination = new_combination_next
+                left_layer_dim_size = left_layer_dim_size_next
+            else:
+                for layer_dim, layer_dim_size in left_layer_dim_size.items():
+                    # A special case in which we still use new_combination_next even though legal_spatial_loop == True:
+                    # this is when new_combination_next exactly matches the layer dim size (left size == 1)
+                    if layer_dim_size < 1 and left_layer_dim_size_next[layer_dim] == 1:
+                        new_combination = new_combination_next
+                        left_layer_dim_size = left_layer_dim_size_next
+                        break
+        return new_combination, left_layer_dim_size
+
+    def check_spatial_loop_legality(self, combination, layer):
+        # Extra check on the total unrolling size of a layer dim, if it is mapped on >=2 dimensions.
+        combination_check = {
+            layer_dim: layer_size
+            for layer_dim, layer_size in layer.loop_dim_size.items()
+        }
+        legal_spatial_loop = True  # initialization
+        for unrolling_in_combination in combination:
+            if unrolling_in_combination is None:
+                continue
+            if self.is_nested_tuple(unrolling_in_combination):
+                for sub_unrolling_in_combination in unrolling_in_combination:
+                    unrolling_layer_dim = sub_unrolling_in_combination[0]
+                    unrolling_layer_size = sub_unrolling_in_combination[1]
                     if unrolling_layer_dim in combination_check.keys():
                         combination_check[unrolling_layer_dim] /= unrolling_layer_size
                     else:
@@ -220,25 +370,28 @@ def generate_user_spatial_mappings(
                         # This only happens when the spatial mapping is user-defined, which
                         # contains non-existent layer dims in current layer.
                         pass
-        for layer_dim, layer_size in combination_check.items():
-            if layer_size < 1:  # the layer size/the unrolling size < 1
-                # It means the unrolling size > the layer size, which is incorrect and impossible.
-                continue
-
-        user_spatial_mapping = {
-            oa_dim_name: unrolling
-            for (oa_dim_name, unrolling) in zip(oa_dim_names, combination)
-            if unrolling is not None
-        }
-        yield user_spatial_mapping
+            else:
+                unrolling_layer_dim = unrolling_in_combination[0]
+                unrolling_layer_size = unrolling_in_combination[1]
+                if unrolling_layer_dim in combination_check.keys():
+                    combination_check[unrolling_layer_dim] /= unrolling_layer_size
+                else:
+                    # The unrolled layer dim does not exist in current layer.
+                    # This only happens when the spatial mapping is user-defined, which
+                    # contains non-existent layer dims in current layer.
+                    pass
+        for layer_dim, layer_size in combination_check.items():
+            if layer_size < 1:  # the layer size/the unrolling size < 1
+                # It means the unrolling size > the layer size, which is incorrect and impossible.
+                legal_spatial_loop = False
+                break
+        return legal_spatial_loop, combination_check

     def append_mix_spatial_unrollings(
         self, provided_oa_dim_unrollings, provided_oa_dim_unrolling_hints, oa_dim
     ):
         # Create and append new mix spatial unrollings to original oa_dim_unrollings
         # An example of mix: (("K",2), ("OX", 2))
-        import math
-
         oa_dim_unrollings = provided_oa_dim_unrollings
         oa_dim_unrolling_hints = provided_oa_dim_unrolling_hints
         if (
@@ -384,8 +537,6 @@ def sort_oa_dim_unrollings_in_the_order_of_utilization(
     # @param descending:
     #   True -- the higher the mapping utilization is, the closer to the front it is.
     #   False -- the lower the mapping utilization is, the closer to the front it is.
-        import math
-
         oa_dim_unrollings = provided_oa_dim_unrollings
         if len(oa_dim_unrollings) > 1:
             # First we will record down the hardware utilization of each spatial unrolling in comb_value
@@ -423,6 +574,311 @@ def sort_oa_dim_unrollings_in_the_order_of_utilization(
             hardware_utilization = None
         return oa_dim_unrollings, hardware_utilization

+    def add_input_pr_spatial_loop_if_enabled(
+        self,
+        layer,
+        provided_user_spatial_mapping,
+        user_spatial_mapping_hint,
+        innermost_levels,
+        left_layer_dim_size,
+        enable_mix_spatial_mapping_generation,
+    ):
+        # This function is used to support diagonal spatial mapping
+        # when input/activation is served in the innermost memories and the weight is stationary.
+        user_spatial_mapping = provided_user_spatial_mapping
+        # get the link from layer op to mem op
+        layer_op_to_mem_op: dict = layer.memory_operand_links
+        # check if it is weight stationary.
+        # keep the spatial loop as it was if it is not weight stationary.
+        if len(layer.constant_operands) > 1:
+            return user_spatial_mapping
+        # get weight operand name
+        const_operand = layer.constant_operands[0]  # weight representation
+        # get activation operand name
+        act_operand = [
+            operand for operand in layer.input_operands if operand != const_operand
+        ][0]
+        # get output operand name
+        output_operand = layer.output_operand
+        # get name of OX, OY (weight ir layer dims)
+        weight_ir_layer_dims: list = layer.operand_loop_dim[const_operand]["ir"]
+        # get the oa_dim name served by input / output innermost memory level
+        for memory_level in innermost_levels:
+            mem_ops = memory_level.operands
+            if layer_op_to_mem_op[act_operand] in mem_ops:
+                act_served_oa_dim: set = memory_level.served_dimensions
+            if layer_op_to_mem_op[output_operand] in mem_ops:
+                output_served_oa_dim: set = memory_level.served_dimensions
+        # If act is not served in the innermost memories, or act/output is not served on exactly one oa dimension,
+        # keep the spatial loop as it was.
+        if "act_served_oa_dim" not in locals() or len(act_served_oa_dim) != 1:
+            return user_spatial_mapping
+        if "output_served_oa_dim" not in locals() or len(output_served_oa_dim) != 1:
+            return user_spatial_mapping
+
+        act_served_oa_dim_name = list(act_served_oa_dim)[0].name
+        output_served_oa_dim_name = list(output_served_oa_dim)[0].name
+        act_served_oa_dim_size = list(act_served_oa_dim)[0].size
+        output_served_oa_dim_size = list(output_served_oa_dim)[0].size
+
+        # check if OX / OY is in user_spatial_mapping_hint; otherwise target_layer_dim will be empty
+        target_layer_dim = []  # OX or OY or both
+        for layer_dim in weight_ir_layer_dims:
+            if layer_dim in user_spatial_mapping_hint[act_served_oa_dim_name]:
+                target_layer_dim.append(layer_dim)
+
+        # no further execution if OX / OY unrolling is not in user_spatial_mapping_hint
+        if len(target_layer_dim) == 0:
+            return user_spatial_mapping
+
+        ############################################
+        # Get the existing mapping size on act_served_oa_dim, to which OX, OY will be added later.
+        if (
+            act_served_oa_dim_name in user_spatial_mapping.keys()
+        ):  # there is already an sm loop
+            sm_loop = user_spatial_mapping[act_served_oa_dim_name]
+            if self.is_nested_tuple(sm_loop):  # a mix sm mapping
+                exist_act_loop_size = 1
+                for element in sm_loop:
+                    exist_act_loop_size *= element[1]
+            else:  # a single layer sm mapping
+                exist_act_loop_size = sm_loop[1]
+        else:  # there is no sm loop mapped on the act served dim
+            exist_act_loop_size = 1
+
+        # Check if the existing mapping size is more than half of the current oa dim size.
+        # If so, it means there is no space for an extra mapping, even with a size of 2.
+        # In that case, we do nothing but return the original spatial mapping.
+        if exist_act_loop_size * 2 > act_served_oa_dim_size:
+            return user_spatial_mapping
+
+        # fetch pr loop pairs for activation, e.g. {"IX": ["OX", "FX"]}
+        act_pr_layer_dims: dict = layer.operand_loop_dim[act_operand]["pr"]
+
+        # Next we get the existing mapping size on output_served_oa_dim.
+        # There are two classes of mapping:
+        # (1) ir mapping to weight, e.g. "C"
+        # (2) r mapping to weight, e.g. "FX", "FY" (kernel size)
+
+        # We first create a dict to record the existing r mapping to weight.
+        # It will look like:
+        # weight_r_loop = {"OX": {"FX": 1}, "OY": {"FY": 1}}
+        weight_r_loop: dict = {}  # here we put a nested dict for recording
+        loops_name_for_kernel_size: list = []
+        pr_sm_link: dict = (
+            {}
+        )  # here we record the link between pr loops, e.g. link["FX"] = "OX"
+
+        for weight_ir_layer_dim in weight_ir_layer_dims:
+            for [layer_dim1, layer_dim2] in act_pr_layer_dims.values():
+                if weight_ir_layer_dim in [layer_dim1, layer_dim2]:
+                    break
+            # as we are unsure whether act_pr_layer_dims holds [OX, FX] or [FX, OX], we consider both possibilities
+            if layer_dim1 == weight_ir_layer_dim:  # if the first one is OX / OY
+                weight_r_loop[layer_dim1] = {layer_dim2: 1}  # 1 by default
+                loops_name_for_kernel_size.append(layer_dim2)
+                pr_sm_link[layer_dim2] = layer_dim1
+            else:  # layer_dim2 == weight_ir_layer_dim, the second one is OX / OY
+                weight_r_loop[layer_dim2] = {layer_dim1: 1}  # 1 by default
+                loops_name_for_kernel_size.append(layer_dim1)
+                pr_sm_link[layer_dim1] = layer_dim2
+
+        # Next we update the dict, and also find the mapping size (weight ir loop size) we do not care about.
+        weight_ir_loop_size = 1  # default value
+        sm_loop = user_spatial_mapping[output_served_oa_dim_name]
+        if self.is_nested_tuple(sm_loop):  # a mix sm mapping
+            for element in sm_loop:
+                # same operation as above
+                layer_dim = element[0]
+                mapping_size = element[1]
+                if layer_dim in loops_name_for_kernel_size:  # layer_dim in ["FX", "FY"]
+                    paired_pr_layer_dim = pr_sm_link[
+                        layer_dim
+                    ]  # "FX" -> "OX", "FY" -> "OY"
+                    weight_r_loop[paired_pr_layer_dim][layer_dim] *= mapping_size
+                else:  # don't care
+                    weight_ir_loop_size *= mapping_size
+        else:  # a single layer sm mapping
+            layer_dim = sm_loop[0]
+            mapping_size = sm_loop[1]
+            if layer_dim in loops_name_for_kernel_size:  # layer_dim in ["FX", "FY"]
+                paired_pr_layer_dim = pr_sm_link[
+                    layer_dim
+                ]  # "FX" -> "OX", "FY" -> "OY"
+                weight_r_loop[paired_pr_layer_dim][layer_dim] *= mapping_size
+            else:  # don't care
+                weight_ir_loop_size *= mapping_size
+
+        # At this point, we already know what sm mappings exist.
+        ############################################
+
+        # Next we try to add possible OX / OY mappings.
+        # Find all possible OX / OY mapping breakdowns and put them in the pool.
+        # It looks like:
+        # sm_pools = {"OX": [("OX",2),("OX",5),("OX",5)], "OY": [("OY",2),("OY",5),("OY",5)]}
+        sm_pools_to_add: dict = {}
+        for layer_dim in target_layer_dim:
+            layer_size = self.layer.loop_dim_size[layer_dim]
+            layer_size_breakdown: list = self.prime_factors(layer_size)
+
+            # try to find the maximum OX / OY and add it to the list
+            # (1) check on act_served_oa_dim (round down to an integer)
+            max_allowed_dim_size_on_act_served_dim = math.floor(
+                act_served_oa_dim_size / exist_act_loop_size
+            )
+            # (2) check on output_served_oa_dim
+            existed_pr_mapping = list(weight_r_loop[layer_dim].values())[0]
+            for key in weight_r_loop.keys():
+                if key != layer_dim:
+                    ir_layer_dim_to_current_layer_dim = key
+            existed_pr_mapping_but_ir_to_current_layer_dim = list(
+                weight_r_loop[ir_layer_dim_to_current_layer_dim].values()
+            )[0]
+            max_allowed_dim_size_on_output_served_dim = (
+                output_served_oa_dim_size
+                / weight_ir_loop_size
+                / existed_pr_mapping_but_ir_to_current_layer_dim
+            ) - (existed_pr_mapping - 1)
+            # round down to an integer
+            max_allowed_dim_size_on_output_served_dim = math.floor(
+                max_allowed_dim_size_on_output_served_dim
+            )
+            max_allowed_target_dim_size = min(
+                max_allowed_dim_size_on_act_served_dim,
+                max_allowed_dim_size_on_output_served_dim,
+            )
+            # check whether each element in layer_size_breakdown is allowed to be added
+            legal_layer_size_breakdown = []
+            for factor in layer_size_breakdown:
+                if (
+                    factor <= max_allowed_target_dim_size
+                    and factor <= left_layer_dim_size[layer_dim]
+                ):
+                    legal_layer_size_breakdown.append(factor)
+            if len(legal_layer_size_breakdown) > 0:
+                sm_pools_to_add[layer_dim] = [
+                    tuple([layer_dim, size]) for size in legal_layer_size_breakdown
+                ]
+
+        # check if there is anything in the pool
+        if len(sm_pools_to_add) == 0:
+            return user_spatial_mapping
+
+        # Generate possible combinations.
+        # In the for loop below, we first try OX or OY alone, then their combination.
+        # In the end, we only keep the best one, which has the maximal value of OX*OY.
+        # If there are multiple combs having the same OX*OY, we keep the first one, as their costs are the same.
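# prime_factors is called above but not shown in this diff; judging from the pool example
# in the comment, it returns the prime factors with multiplicity. A sketch of a helper
# with that behavior (an assumption, not the PR's code):
def prime_factors(n: int) -> list:
    factors = []
    p = 2
    while p * p <= n:
        while n % p == 0:
            factors.append(p)
            n //= p
        p += 1
    if n > 1:
        factors.append(n)
    return factors

print(prime_factors(50))  # [2, 5, 5] -> pool entries ("OX", 2), ("OX", 5), ("OX", 5)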
+        best_comb = []  # list initialization
+        best_comb_size = 0  # reference value to find the best comb
+        target_layer_dim = [
+            layer_dim
+            for layer_dim in target_layer_dim
+            if layer_dim in sm_pools_to_add.keys()
+        ]
+        if enable_mix_spatial_mapping_generation:
+            allowed_dim_comb_length = len(target_layer_dim)
+        else:
+            allowed_dim_comb_length = 1
+        for dim_comb_length in range(1, allowed_dim_comb_length + 1):
+            for dim_comb in itertools.combinations(target_layer_dim, dim_comb_length):
+                # we create a temporary pool for each dim combination
+                sm_pools_mix = []
+                for layer_dim in dim_comb:
+                    sm_pools_mix += sm_pools_to_add[layer_dim]
+                max_comb_length = len(
+                    sm_pools_mix
+                )  # the max possible length of a combination
+                for comb_length in range(1, max_comb_length + 1):
+                    for comb in itertools.combinations(sm_pools_mix, comb_length):
+                        # At this point, in comb, we have a possible OX / OY mapping.
+                        # First we get the current comb size.
+                        # Example: comb_mapping = {"OX": 5, "OY": 10}
+                        comb_mapping: dict = {}
+                        for layer_dim in dim_comb:
+                            comb_mapping[layer_dim] = 1  # default value
+                        for element in comb:
+                            layer_dim = element[0]
+                            mapping_size = element[1]
+                            comb_mapping[layer_dim] *= mapping_size
+                        # Skip if the current unrolling on a layer_dim is 1, which means it has been checked already.
+                        curr_comb_already_checked = False
+                        for unroll_size in comb_mapping.values():
+                            if unroll_size == 1:
+                                curr_comb_already_checked = True
+                                break
+                        if curr_comb_already_checked:
+                            continue
+                        # Check if this comb is possible
+                        # (1) check on left_layer_dim_size
+                        curr_comb_illegal = False
+                        for unroll_dim, unroll_size in comb_mapping.items():
+                            if unroll_size > left_layer_dim_size[unroll_dim]:
+                                curr_comb_illegal = True
+                                break
+                        if curr_comb_illegal:
+                            continue
+                        # (2) check on act_served_oa_dim
+                        comb_size = math.prod([v for v in comb_mapping.values()])
+                        required_oa_dim_size = exist_act_loop_size * comb_size
+                        if required_oa_dim_size > act_served_oa_dim_size:
+                            continue  # the comb is not possible on act_served_oa_dim
+                        # (3) check on output_served_oa_dim
+                        required_oa_dim_size = weight_ir_loop_size
+                        for layer_dim in comb_mapping.keys():
+                            existed_pr_mapping = list(
+                                weight_r_loop[layer_dim].values()
+                            )[0]
+                            pr_mapping_to_add = comb_mapping[layer_dim]
+                            new_mapping_size = (
+                                existed_pr_mapping + pr_mapping_to_add - 1
+                            )
+                            required_oa_dim_size *= new_mapping_size
+                        if len(comb_mapping) == 1:  # only OX or OY
+                            # add the other existing pr loop to required_oa_dim_size,
+                            # because it was not counted in output_served_oa_dim_size before.
+                            sole_dim = list(comb_mapping.keys())[0]
+                            the_other_pr_mapping_name = [
+                                key for key in weight_r_loop.keys() if key != sole_dim
+                            ][0]
+                            the_other_pr_mapping_size = list(
+                                weight_r_loop[the_other_pr_mapping_name].values()
+                            )[0]
+                            required_oa_dim_size *= the_other_pr_mapping_size
+                        if required_oa_dim_size > output_served_oa_dim_size:
+                            continue  # this comb is not possible on output_served_oa_dim
+                        # (4) compare with best_comb
+                        if comb_size > best_comb_size:
+                            # reformat the comb and merge repetitive elements
+                            # example: (("OX", 5), ("OY", 2))
+                            new_comb: list = [
+                                (layer_dim, mapping_size)
+                                for (layer_dim, mapping_size) in comb_mapping.items()
+                            ]
+                            best_comb = new_comb
+
+        # At this point, we have the best possible comb to add. Now we add it to the current sm mapping.
+        if len(best_comb) == 0:  # did not find any comb
+            return user_spatial_mapping
+        else:
+            if (
+                act_served_oa_dim_name in user_spatial_mapping.keys()
+            ):  # there was already an sm loop
+                act_served_mapping_to_change = user_spatial_mapping[
+                    act_served_oa_dim_name
+                ]
+                if self.is_nested_tuple(
+                    act_served_mapping_to_change
+                ):  # originally a mix mapping
+                    reformed_sm = list(act_served_mapping_to_change) + best_comb
+                else:  # originally a single-layer mapping
+                    reformed_sm = [act_served_mapping_to_change] + best_comb
+            else:  # there was no sm loop on the act served oa dim
+                reformed_sm = best_comb
+            reformed_sm = tuple(reformed_sm)
+            user_spatial_mapping[act_served_oa_dim_name] = reformed_sm
+
+        return user_spatial_mapping
+
     @staticmethod
     def all_unique(items):
         return len(set(items)) == len(items)
diff --git a/zigzag/classes/stages/CostModelStage.py b/zigzag/classes/stages/CostModelStage.py
index 941c7db0..ce2d135b 100644
--- a/zigzag/classes/stages/CostModelStage.py
+++ b/zigzag/classes/stages/CostModelStage.py
@@ -30,6 +30,7 @@ def __init__(
         accelerator,
         layer,
         spatial_mapping,
+        spatial_mapping_int,
         temporal_mapping,
         access_same_data_considered_as_no_access=True,
         **kwargs
@@ -39,12 +40,14 @@ def __init__(
             self.accelerator,
             self.layer,
             self.spatial_mapping,
+            self.spatial_mapping_int,
             self.temporal_mapping,
             self.access_same_data_considered_as_no_access,
         ) = (
             accelerator,
             layer,
             spatial_mapping,
+            spatial_mapping_int,
             temporal_mapping,
             access_same_data_considered_as_no_access,
         )
@@ -55,6 +58,7 @@ def run(self) -> Generator[Tuple[CostModelEvaluation, Any], None, None]:
             accelerator=self.accelerator,
             layer=self.layer,
             spatial_mapping=self.spatial_mapping,
+            spatial_mapping_int=self.spatial_mapping_int,
             temporal_mapping=self.temporal_mapping,
             # the below parameter is optional
             access_same_data_considered_as_no_access=self.access_same_data_considered_as_no_access,
diff --git a/zigzag/classes/stages/SearchUnusedMemoryStage.py b/zigzag/classes/stages/SearchUnusedMemoryStage.py
index c63136b0..5143ee9f 100644
--- a/zigzag/classes/stages/SearchUnusedMemoryStage.py
+++ b/zigzag/classes/stages/SearchUnusedMemoryStage.py
@@ -302,9 +302,16 @@ def update_top_mem_level(self):
                     if (
                         const_operand in served_operands
                     ):  # identify the top weight mem level
+                        # We need to check if the current mem serves all oa dims; otherwise we do not decrease
+                        # mem_update_weight.
+                        # The reason is that if the current mem does not serve all oa dims, the mapping will impact
+                        # the memory utilization, so comparing solely against the total memory size would be incorrect.
+                        mem_serve_all_oa_dims = self.check_if_mem_serve_all_oa_dims(
+                            mem, self.accelerator
+                        )
                         if (
                             curr_mem_level < self.mem_update_weight
-                        ):  # mem_update_weight is bigger than the top weight mem level
+                        ) and mem_serve_all_oa_dims:  # mem_update_weight is bigger than the top weight mem level
                             self.mem_update_weight = curr_mem_level
                         break
         else:  ## node (layer) that is not a branch starting node or a branch final node
@@ -402,9 +409,18 @@ def update_top_mem_level(self):
                         self.update_IO_mem_level(
                             curr_id, output_operand, curr_mem_level
                         )  # update output mem level
+                    # For weight, we need to check if the current mem serves all oa dims; otherwise we do not
+                    # decrease mem_update_weight.
+                    # The reason is that if the current mem does not serve all oa dims, the mapping will impact
+                    # the memory utilization, so comparing solely against the total memory size would be incorrect.
+                    mem_serve_all_oa_dims = self.check_if_mem_serve_all_oa_dims(
+                        mem, self.accelerator
+                    )
                     if (
-                        curr_mem_level < self.mem_update_weight
-                    ) and mem_serve_weight:  # update weight mem level
+                        (curr_mem_level < self.mem_update_weight)
+                        and mem_serve_all_oa_dims
+                        and mem_serve_weight
+                    ):  # update weight mem level
                         self.mem_update_weight = curr_mem_level
         ## [OPTIONAL CHECK] assert that there is no -1 value left in mem_update_list
         ## [NOTE] Until here, if there is still a -1 value in mem_update_list, it means the size of the top mem level for IO is not big enough.
@@ -414,6 +430,17 @@ def update_top_mem_level(self):
                 list(operand_dict.values())[0] >= 0
             ), "SearchUnusedMemoryStage finishes abnormally, there are still layers with top mem levels not figured out."

+    def check_if_mem_serve_all_oa_dims(self, mem, accelerator):
+        # check if the mem serves all hardware dimensions
+        core = accelerator.cores[0]
+        operational_array = core.operational_array
+        oa_dim_nb = len(operational_array.dimensions)
+        mem_served_oa_dim_nb = len(mem.served_dimensions)
+        if mem_served_oa_dim_nb == oa_dim_nb:
+            return True
+        else:
+            return False
+
     def update_mem_level_for_loading_data(self):
         """
         [OPTIONAL FUNCTION] This is an optional function.
diff --git a/zigzag/classes/stages/SpatialMappingConversionStage.py b/zigzag/classes/stages/SpatialMappingConversionStage.py
index 7707d172..617921e7 100644
--- a/zigzag/classes/stages/SpatialMappingConversionStage.py
+++ b/zigzag/classes/stages/SpatialMappingConversionStage.py
@@ -52,7 +52,9 @@ def is_nested_tuple(obj):

     def run(self):
         user_spatial_mapping = self.layer.user_spatial_mapping
-        spatial_mapping = self.convert_user_spatial_mapping(user_spatial_mapping)
+        spatial_mapping, spatial_mapping_int = self.convert_user_spatial_mapping(
+            user_spatial_mapping
+        )
         # Since the spatial_mapping may be modified in the previous step,
         # we have to update this change to self.layer
         updated_user_spatial_mapping = {}
@@ -75,6 +77,7 @@ def run(self):

         kwargs = self.kwargs.copy()
         kwargs["spatial_mapping"] = spatial_mapping
+        kwargs["spatial_mapping_int"] = spatial_mapping_int
         kwargs["accelerator"] = self.accelerator
         kwargs["layer"] = self.layer

@@ -104,13 +107,31 @@ def convert_user_spatial_mapping(self, user_spatial_mapping):
         oa_dims = core.operational_array.dimensions
         layer_dim_sizes = self.layer.loop_dim_size.copy()
         limited_user_spatial_mapping = {}  # init dict we will be filling
+        limited_user_spatial_mapping_int = {}  # init int dict we will be filling
         for oa_dim_name, spatial_loop in user_spatial_mapping.items():
             if self.is_nested_tuple(spatial_loop):  # mix sm loop
                 limited_mix_user_spatial_mapping_on_dim = []
+                limited_mix_user_spatial_mapping_int_on_dim = []
                 for spatial_loop_element in spatial_loop:
                     limited_user_spatial_mapping_to_check = (
                         self.generate_limited_user_spatial_mapping(
-                            layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop_element
+                            layer_dim_sizes,
+                            oa_dims,
+                            oa_dim_name,
+                            spatial_loop_element,
+                            user_spatial_mapping,
+                            limited_user_spatial_mapping,
+                        )
+                    )
+                    limited_user_spatial_mapping_int_to_check = (
+                        self.generate_limited_user_spatial_mapping(
+                            layer_dim_sizes,
+                            oa_dims,
+                            oa_dim_name,
+                            spatial_loop_element,
+                            user_spatial_mapping,
+                            limited_user_spatial_mapping,
+                            False,
                         )
                     )
                     if limited_user_spatial_mapping_to_check == None:
@@ -119,19 +140,44 @@ def convert_user_spatial_mapping(self, user_spatial_mapping):
                         limited_mix_user_spatial_mapping_on_dim.append(
                             limited_user_spatial_mapping_to_check
                         )
+                        limited_mix_user_spatial_mapping_int_on_dim.append(
limited_user_spatial_mapping_int_to_check + ) if len(limited_mix_user_spatial_mapping_on_dim) == 0: continue # Skip this spatial dimension if the defined dims in sm don't exist in the layer else: limited_mix_user_spatial_mapping_on_dim = tuple( limited_mix_user_spatial_mapping_on_dim ) + limited_mix_user_spatial_mapping_int_on_dim = tuple( + limited_mix_user_spatial_mapping_int_on_dim + ) limited_user_spatial_mapping[ oa_dim_name ] = limited_mix_user_spatial_mapping_on_dim + limited_user_spatial_mapping_int[ + oa_dim_name + ] = limited_mix_user_spatial_mapping_int_on_dim else: # single-dim sm loop limited_user_spatial_mapping_to_check = ( self.generate_limited_user_spatial_mapping( - layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop + layer_dim_sizes, + oa_dims, + oa_dim_name, + spatial_loop, + user_spatial_mapping, + limited_user_spatial_mapping, + ) + ) + limited_user_spatial_mapping_int_to_check = ( + self.generate_limited_user_spatial_mapping( + layer_dim_sizes, + oa_dims, + oa_dim_name, + spatial_loop, + user_spatial_mapping, + limited_user_spatial_mapping, + False, ) ) if limited_user_spatial_mapping_to_check == None: @@ -140,6 +186,9 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): limited_user_spatial_mapping[ oa_dim_name ] = limited_user_spatial_mapping_to_check + limited_user_spatial_mapping_int[ + oa_dim_name + ] = limited_user_spatial_mapping_int_to_check # Update the layer_dim_size to support multiple oa dims unrolling the same loop dim but not unrolling it more than the total layer dim # if ( # temporal_remainder == 1 @@ -159,15 +208,102 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): f"User-provided spatial mapping converted to: {user_spatial_mapping_for_log}" ) + spatial_mapping_dict = self.generate_spatial_mapping_dict( + user_spatial_mapping=limited_user_spatial_mapping, + layer=self.layer, + accelerator=self.accelerator, + ) + # The next spatial_mapping_dict is used in cost model to calculate the interval between different data transfer. 
+        # Different from the one above, it contains only integer numbers (corresponding to the real hardware behavior)
+        spatial_mapping_dict_int = self.generate_spatial_mapping_dict(
+            user_spatial_mapping=limited_user_spatial_mapping_int,
+            layer=self.layer,
+            accelerator=self.accelerator,
+        )
+
+        return SpatialMapping(
+            spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer
+        ), SpatialMapping(
+            spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer
+        )
+
+    def generate_limited_user_spatial_mapping(
+        self,
+        layer_dim_sizes,
+        oa_dims,
+        oa_dim_name,
+        spatial_loop,
+        user_spatial_mapping,
+        limited_user_spatial_mapping,
+        allow_decimal_sm_loop_size=True,
+    ):
+        ## Perform checks on the spatial mapping, and convert the mapping to a tuple
+        (loop_dim_unrolled, loop_size_unrolled) = spatial_loop
+        # Check 0: Skip this spatial dimension if it doesn't exist in the layer
+        if loop_dim_unrolled not in layer_dim_sizes.keys():
+            return None
+        # Check 1: Limit unrolling if operational array dimension is smaller than provided unrolling
+        oa_dim_size = next(
+            (oa_dim for oa_dim in oa_dims if oa_dim.name == oa_dim_name)
+        ).size
+        loop_size_unrolled = min(oa_dim_size, loop_size_unrolled)
+        # Check 2: Limit unrolling if layer dimension is smaller than provided unrolling or if the loop dim doesn't exist
+        layer_dim_size = layer_dim_sizes.get(loop_dim_unrolled, 1)
+        loop_size_unrolled = min(layer_dim_size, loop_size_unrolled)
+        # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size
+        # and if there is no more mapping for this layer dimension
+        no_more_mapping_for_current_layer_dim = (
+            self.check_if_there_is_further_oa_mapping_for_current_layer_dim(
+                oa_dim_name=oa_dim_name,
+                loop_dim_unrolled=loop_dim_unrolled,
+                user_spatial_mapping=user_spatial_mapping,
+            )
+        )
+        if no_more_mapping_for_current_layer_dim:
+            loop_size_unrolled_on_early_oa_dims = (
+                self.calc_unrolled_loop_size_on_early_oa_dims(
+                    oa_dim_name=oa_dim_name,
+                    loop_dim_unrolled=loop_dim_unrolled,
+                    user_spatial_mapping=limited_user_spatial_mapping,
+                )
+            )
+            temporal_remainder = int(
+                np.ceil(
+                    layer_dim_size
+                    / (loop_size_unrolled * loop_size_unrolled_on_early_oa_dims)
+                )
+            )
+            if allow_decimal_sm_loop_size:
+                loop_size_unrolled = (
+                    layer_dim_size
+                    / temporal_remainder
+                    / loop_size_unrolled_on_early_oa_dims
+                )
+            else:
+                loop_size_unrolled = int(
+                    np.ceil(
+                        layer_dim_size
+                        / temporal_remainder
+                        / loop_size_unrolled_on_early_oa_dims
+                    )
+                )
+        return (
+            loop_dim_unrolled,
+            loop_size_unrolled,
+        )
+
+    def generate_spatial_mapping_dict(self, user_spatial_mapping, layer, accelerator):
+        # This function converts a spatial mapping into a spatial_mapping_dict,
+        # which attaches the spatial mapping to the different memory levels.
spatial_mapping_dict = {} - layer_to_mem_op = self.layer.memory_operand_links + layer_to_mem_op = layer.memory_operand_links mem_to_layer_op = { mem_op: layer_op for (layer_op, mem_op) in layer_to_mem_op.items() } - core_id = self.layer.core_allocation - mem_hierarchy = self.accelerator.get_core(core_id).memory_hierarchy + core_id = layer.core_allocation + mem_hierarchy = accelerator.get_core(core_id).memory_hierarchy for mem_op, layer_op in mem_to_layer_op.items(): - user_sm_copy = limited_user_spatial_mapping.copy() + user_sm_copy = user_spatial_mapping.copy() # layer_op = mem_to_layer_op[mem_op] spatial_mapping_dict[layer_op] = [] memory_levels = mem_hierarchy.get_memory_levels( @@ -227,35 +363,84 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): # After we have gone through the memory levels, if there are still user-defined dimensions # present, add them as the top level. Otherwise add an empty list to make arch levels correct: # because first list we added was the operational array level. + + # We will merge together if the top memory level is serving multiple oa dims + # and there are layer dims existing on multiple oa dims. + top_level_spatial_mapping_dict = {} + for (dim_name, spatial_loop) in user_sm_copy.items(): + if self.is_nested_tuple(spatial_loop): # mix sm loop + for sub_spatial_loop in spatial_loop: + spatial_loop_dim = sub_spatial_loop[0] + spatial_loop_size = sub_spatial_loop[1] + if spatial_loop_dim not in top_level_spatial_mapping_dict.keys(): + top_level_spatial_mapping_dict[spatial_loop_dim] = spatial_loop_size + else: + top_level_spatial_mapping_dict[spatial_loop_dim] *= spatial_loop_size + else: + spatial_loop_dim = spatial_loop[0] + spatial_loop_size = spatial_loop[1] + if spatial_loop_dim not in top_level_spatial_mapping_dict.keys(): + top_level_spatial_mapping_dict[spatial_loop_dim] = spatial_loop_size + else: + top_level_spatial_mapping_dict[spatial_loop_dim] *= spatial_loop_size top_level_spatial_mapping = [ - spatial_loop for (dim_name, spatial_loop) in user_sm_copy.items() + (layer_dim, layer_size) for (layer_dim, layer_size) in top_level_spatial_mapping_dict.items() ] spatial_mapping_dict[layer_op].append(top_level_spatial_mapping) + return spatial_mapping_dict - return SpatialMapping( - spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer - ) + def check_if_there_is_further_oa_mapping_for_current_layer_dim( + self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping + ): + # For the case when there is layer dimension that is mapped on multiple oa dimensions. + # We need to decide on which oa dimension to adjust the unrolling + # if the total unrolling size is not a multiple of the layer dimension size. + # In this case, we decide to only adjust the unrolling size on the last oa dimension, + # This function is to check if the current oa dimension is the last oa dimension for the current layer dim. 
+ start_check_on_layer_dim_mapping = False + no_more_mapping_for_current_layer_dim = True + for oa_dim_name_private, spatial_loop_private in user_spatial_mapping.items(): + if oa_dim_name == oa_dim_name_private: + start_check_on_layer_dim_mapping = True + continue + if start_check_on_layer_dim_mapping: + if self.is_nested_tuple(spatial_loop_private): # mix sm loop + for spatial_loop_element in spatial_loop_private: + loop_dim_unrolled_private = spatial_loop_element[0] + if loop_dim_unrolled == loop_dim_unrolled_private: + no_more_mapping_for_current_layer_dim = False + break + else: + loop_dim_unrolled_private = spatial_loop_private[0] + if loop_dim_unrolled == loop_dim_unrolled_private: + no_more_mapping_for_current_layer_dim = False + if ( + not no_more_mapping_for_current_layer_dim + ): # early exit if the flag is already False + break + return no_more_mapping_for_current_layer_dim - def generate_limited_user_spatial_mapping( - self, layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop + def calc_unrolled_loop_size_on_early_oa_dims( + self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping ): - ## Do check on spatial mapping, and convert the mapping to a tuple - (loop_dim_unrolled, loop_size_unrolled) = spatial_loop - # Check 0: Skip this spatial dimension if it doesn't exist in the layer - if loop_dim_unrolled not in layer_dim_sizes.keys(): - return None - # Check 1: Limit unrolling if operational array dimension is smaller than provided unrolling - oa_dim_size = next( - (oa_dim for oa_dim in oa_dims if oa_dim.name == oa_dim_name) - ).size - loop_size_unrolled = min(oa_dim_size, loop_size_unrolled) - # Check 2: Limit unrolling if layer dimension is smaller than provided unrolling or if the loop dim doesn't exist - layer_dim_size = layer_dim_sizes.get(loop_dim_unrolled, 1) - loop_size_unrolled = min(layer_dim_size, loop_size_unrolled) - # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size - temporal_remainder = int(np.ceil(layer_dim_size / loop_size_unrolled)) - loop_size_unrolled = layer_dim_size / temporal_remainder - return ( - loop_dim_unrolled, - loop_size_unrolled, - ) + # calculate the unrolled loop size for the specific layer dim on oa dims earlier than current oa dim + loop_unrolled_size_already = 1 + for oa_dim_name_private, spatial_loop_private in user_spatial_mapping.items(): + if oa_dim_name == oa_dim_name_private: + break + if self.is_nested_tuple(spatial_loop_private): # mix sm loop + for spatial_loop_element in spatial_loop_private: + ( + loop_dim_unrolled_private, + loop_size_unrolled_private, + ) = spatial_loop_element + if loop_dim_unrolled == loop_dim_unrolled_private: + loop_unrolled_size_already *= loop_size_unrolled_private + else: + ( + loop_dim_unrolled_private, + loop_size_unrolled_private, + ) = spatial_loop_private + if loop_dim_unrolled == loop_dim_unrolled_private: + loop_unrolled_size_already *= loop_size_unrolled_private + return loop_unrolled_size_already diff --git a/zigzag/classes/stages/SpatialMappingGeneratorStage.py b/zigzag/classes/stages/SpatialMappingGeneratorStage.py index 77e56fcc..8f7bf9fc 100644 --- a/zigzag/classes/stages/SpatialMappingGeneratorStage.py +++ b/zigzag/classes/stages/SpatialMappingGeneratorStage.py @@ -2,11 +2,14 @@ from zigzag.classes.opt.spatial.generator import UserSpatialMappingGenerator from zigzag.classes.hardware.architecture.core import Core +from zigzag.classes.hardware.architecture.accelerator import Accelerator +from zigzag.classes.hardware.architecture.memory_hierarchy import 
MemoryHierarchy from zigzag.classes.stages.Stage import Stage from zigzag.classes.stages.SpatialMappingConversionStage import ( SpatialMappingConversionStage, ) import copy +from zigzag.utils import pickle_deepcopy logger = logging.getLogger(__name__) @@ -33,6 +36,7 @@ def __init__( layer, enable_mix_spatial_mapping_generation=False, maximize_hardware_utilization=True, + enable_weight_diagonal_mapping=False, **kwargs, ): super().__init__(list_of_callables, **kwargs) @@ -43,6 +47,7 @@ def __init__( enable_mix_spatial_mapping_generation ) self.maximize_hardware_utilization = maximize_hardware_utilization + self.enable_weight_diagonal_mapping = enable_weight_diagonal_mapping @staticmethod # Check that the layer includes: @@ -74,6 +79,12 @@ def run(self): user_provided_spatial_mappings, dict ): # There is a single USM provided if len(user_provided_spatial_mappings) < len(oa_dims): + self.layer.user_spatial_mapping_hint = ( + self.complete_user_spatial_mapping_hint( + user_spatial_mapping_hint=user_spatial_mapping_hint, + oa_dims=oa_dims, + ) + ) user_spatial_mapping_generator = UserSpatialMappingGenerator( layer=self.layer, accelerator=self.accelerator, @@ -92,34 +103,18 @@ def run(self): ): # There are multiple USMs provided user_spatial_mappings = user_provided_spatial_mappings else: # There is no USM provided - # Initialize the user_provided_spatial_mapping_hint - if user_spatial_mapping_hint is None: - logger.info( - "User-provided spatial mappings or hints not found. Auto-generating spatial_mapping_hint.." - ) - user_spatial_mapping_hint = {} - for oa_dim in oa_dims: - user_spatial_mapping_hint[oa_dim.name] = [ - layer_dim for layer_dim in self.layer.loop_dim_list - ] - self.layer.user_spatial_mapping_hint = user_spatial_mapping_hint - else: - oa_dims_name = [oa_dim.name for oa_dim in oa_dims] - # Add definition for non-exist dimension in user_spatial_mapping_hint - for oa_dim_name in oa_dims_name: - if oa_dim_name not in user_spatial_mapping_hint.keys(): - user_spatial_mapping_hint[oa_dim_name] = [ - layer_dim for layer_dim in self.layer.loop_dim_list - ] - logger.debug( - "No user-provided spatial mapping found, but a hint was found." + self.layer.user_spatial_mapping_hint = ( + self.complete_user_spatial_mapping_hint( + user_spatial_mapping_hint=user_spatial_mapping_hint, oa_dims=oa_dims ) + ) # Initialize the UserSpatialMappingGenerator which will automatically generate SMs user_spatial_mapping_generator = UserSpatialMappingGenerator( layer=self.layer, accelerator=self.accelerator, enable_mix_spatial_mapping_generation=self.enable_mix_spatial_mapping_generation, maximize_hardware_utilization=self.maximize_hardware_utilization, + enable_weight_diagonal_mapping=self.enable_weight_diagonal_mapping, ) # Get all the USMs by running the generator user_spatial_mappings = list( @@ -136,11 +131,177 @@ def run(self): self.layer.user_spatial_mapping = user_spatial_mapping # Note: manual instantiation of spatial mapping conversion stage here. 
We let that class deal with
     # everything else, including instantiation of the actual substages
-        spatial_mapping_conversion_stage = SpatialMappingConversionStage(
-            self.list_of_callables,
-            accelerator=self.accelerator,
-            layer=copy.copy(self.layer),
-            **self.kwargs,
-        )
+
+        # Modify the size of the lower input mem to support weight diagonal spatial unrolling (for OX/OY)
+        if self.enable_weight_diagonal_mapping:
+            (
+                input_mem_size_updated,
+                new_accelerator,
+            ) = self.modify_innermost_input_mem_size(core_id, user_spatial_mapping)
+        if self.enable_weight_diagonal_mapping and input_mem_size_updated:
+            original_accelerator = self.accelerator
+            spatial_mapping_conversion_stage = SpatialMappingConversionStage(
+                self.list_of_callables,
+                accelerator=new_accelerator,
+                layer=copy.copy(self.layer),
+                **self.kwargs,
+            )
+        else:
+            spatial_mapping_conversion_stage = SpatialMappingConversionStage(
+                self.list_of_callables,
+                accelerator=self.accelerator,
+                layer=copy.copy(self.layer),
+                **self.kwargs,
+            )

         for cme, extra_info in spatial_mapping_conversion_stage.run():
+            if self.enable_weight_diagonal_mapping and input_mem_size_updated:
+                # restore the original accelerator if its mem size was adjusted before
+                cme.accelerator = original_accelerator
             yield cme, (user_spatial_mapping, extra_info)
+
+    def complete_user_spatial_mapping_hint(self, user_spatial_mapping_hint, oa_dims):
+        # This function creates user_spatial_mapping_hint when it is not provided,
+        # or completes it if it is provided for only part of the oa dimensions.
+        complete_user_spatial_mapping_hint = user_spatial_mapping_hint
+        if complete_user_spatial_mapping_hint is None:
+            logger.info(
+                "User-provided spatial mapping hint not found. Auto-generating spatial_mapping_hint.."
+            )
+            complete_user_spatial_mapping_hint = {}
+            for oa_dim in oa_dims:
+                complete_user_spatial_mapping_hint[oa_dim.name] = [
+                    layer_dim for layer_dim in self.layer.loop_dim_list
+                ]
+            # self.layer.user_spatial_mapping_hint = user_spatial_mapping_hint
+        else:
+            oa_dims_name = [oa_dim.name for oa_dim in oa_dims]
+            # Add a definition for dimensions missing from user_spatial_mapping_hint
+            for oa_dim_name in oa_dims_name:
+                if oa_dim_name not in complete_user_spatial_mapping_hint.keys():
+                    complete_user_spatial_mapping_hint[oa_dim_name] = [
+                        layer_dim for layer_dim in self.layer.loop_dim_list
+                    ]
+            # self.layer.user_spatial_mapping_hint = user_spatial_mapping_hint
+        return complete_user_spatial_mapping_hint
+
+    def modify_innermost_input_mem_size(self, core_id, user_spatial_mapping):
+        # To support OX, OY unrolling, we scale the lowest input mem size by OXu*OYu
+        # to avoid the MemoryTooSmallException in the loma stage.
+        input_mem_size_updated = (
+            False  # flag to indicate if the accelerator is modified.
+        )
+        core = self.accelerator.get_core(core_id=core_id)
+        operational_array = core.operational_array
+        oa_dims = operational_array.dimensions
+        memory_hierarchy = copy.deepcopy(core.memory_hierarchy)
+        innermost_levels = memory_hierarchy.get_inner_memories()
+        # get the link from layer op to mem op
+        layer_op_to_mem_op: dict = self.layer.memory_operand_links
+        # check if it is weight stationary.
+        # keep the spatial loop as it was if it is not weight stationary.
+        if len(self.layer.constant_operands) > 1:
+            return input_mem_size_updated, self.accelerator
+        # get weight operand name
+        const_operand = self.layer.constant_operands[0]  # weight representation
+        # get activation operand name
+        act_operand = [
+            operand for operand in self.layer.input_operands if operand != const_operand
+        ][0]
+        # get name of OX, OY (weight ir layer dims)
+        weight_ir_layer_dims: list = self.layer.operand_loop_dim[const_operand]["ir"]
+        # get the oa_dim name served by the input innermost memory level
+        for memory_level in innermost_levels:
+            mem_ops = memory_level.operands
+            if layer_op_to_mem_op[act_operand] in mem_ops:
+                act_innermost_mem_level = memory_level
+                act_served_oa_dim: set = memory_level.served_dimensions
+                act_served_oa_dim_name = list(act_served_oa_dim)[0].name
+        # If act is not served in the innermost memories, or no oa dim is served for act,
+        # keep the spatial loop as it was.
+        if "act_served_oa_dim" not in locals() or len(act_served_oa_dim) == 0:
+            return input_mem_size_updated, self.accelerator
+        # get the mem scaling factor if OX, OY exist
+        mem_scaling_factor = 1
+        if (
+            act_served_oa_dim_name not in user_spatial_mapping.keys()
+        ):  # there is no sm loop
+            pass
+        else:  # there is an sm loop on the act served oa dim
+            act_served_oa_mapping = user_spatial_mapping[act_served_oa_dim_name]
+            if self.is_nested_tuple(
+                act_served_oa_mapping
+            ):  # a mix sm mapping, e.g. (("K", 2), ("OX", 5))
+                for element in act_served_oa_mapping:
+                    layer_dim = element[0]
+                    if layer_dim in weight_ir_layer_dims:
+                        layer_size = element[1]
+                        mem_scaling_factor *= layer_size
+            else:  # a single layer dim mapping
+                layer_dim = act_served_oa_mapping[0]
+                if layer_dim in weight_ir_layer_dims:
+                    layer_size = act_served_oa_mapping[1]
+                    mem_scaling_factor *= layer_size
+        # scale the mem size
+        if mem_scaling_factor == 1:
+            # No need to change the input mem size
+            return input_mem_size_updated, self.accelerator
+        else:
+            input_mem_size_updated = True
+            # Initialize the new memory hierarchy
+            mh_name = memory_hierarchy.name
+            new_mh_name = mh_name + "-supporting-diagonal-map"
+            new_memory_hierarchy = MemoryHierarchy(operational_array, new_mh_name)
+            # Add memories to the new memory hierarchy with the correct attributes
+            for curr_mem_level, memory_level in enumerate(
+                memory_hierarchy.mem_level_list
+            ):
+                memory_instance = memory_level.memory_instance
+                if memory_level == act_innermost_mem_level:
+                    memory_instance.size *= mem_scaling_factor  # scale here; keep the others unchanged
+ operands = tuple(memory_level.operands) + port_alloc = memory_level.port_alloc_raw + served_dimensions_vec = memory_level.served_dimensions_vec + assert len(served_dimensions_vec) >= 1 + served_dimensions = served_dimensions_vec[0] + + new_memory_instance = pickle_deepcopy(memory_instance) + new_operands = pickle_deepcopy(operands) + new_port_alloc = pickle_deepcopy(port_alloc) + new_served_dimensions = pickle_deepcopy(served_dimensions) + new_memory_hierarchy.add_memory( + memory_instance=new_memory_instance, + operands=new_operands, + port_alloc=new_port_alloc, + served_dimensions=new_served_dimensions, + ) + # Create the new core + id = core.id + dataflows = core.dataflows + new_id = id + new_dataflows = pickle_deepcopy(dataflows) + + new_core = Core( + id=new_id, + operational_array=operational_array, + memory_hierarchy=new_memory_hierarchy, + dataflows=new_dataflows, + ) + + # Create the new accelerator + name = self.accelerator.name + new_name = name + "-supporting-diagonal-map" + new_cores = {new_core} + new_accelerator = Accelerator( + name=new_name, + core_set=new_cores, + ) + return input_mem_size_updated, new_accelerator + + @staticmethod + def is_nested_tuple(obj): + if isinstance(obj, tuple): + for item in obj: + if isinstance(item, tuple): + # If any item within the tuple is itself a tuple, it's a nested tuple + return True + return False
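A short standalone illustration of the two user-format spatial mapping shapes that is_nested_tuple distinguishes throughout this PR (re-implemented here for clarity, equivalent to the method above):

single = ("K", 32)             # one layer dim unrolled on an oa dim
mixed = (("K", 2), ("OX", 5))  # two layer dims sharing one oa dim

def is_nested_tuple(obj):
    return isinstance(obj, tuple) and any(isinstance(item, tuple) for item in obj)

assert not is_nested_tuple(single)
assert is_nested_tuple(mixed)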