From 3944d8317cafa1cb9ef7d57ffa2029660ef0421b Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Fri, 10 Nov 2023 10:34:24 +0100 Subject: [PATCH 01/14] fix: filter out impossible spatial loop when one layer dim is mapped on multiple hardware dims; new: adjust the last oa mapping when updating the spatial mapping to be a multiple of the layer dim size; new: only yield 2 spatial mapping options when maximize_hardware_utilization=True (default) --- zigzag/classes/cost_model/cost_model.py | 1 + .../mapping/spatial/spatial_mapping.py | 3 + zigzag/classes/opt/spatial/generator.py | 27 ++++--- .../stages/SpatialMappingConversionStage.py | 70 +++++++++++++++++-- 4 files changed, 81 insertions(+), 20 deletions(-) diff --git a/zigzag/classes/cost_model/cost_model.py b/zigzag/classes/cost_model/cost_model.py index cf684c0c..2a312969 100644 --- a/zigzag/classes/cost_model/cost_model.py +++ b/zigzag/classes/cost_model/cost_model.py @@ -249,6 +249,7 @@ def __init__( self.spatial_mapping_dict_int = spatial_mapping_fractional_to_int( self.spatial_mapping.mapping_dict_origin ) + self.spatial_mapping_dict_int = self.spatial_mapping.mapping_dict_origin # For constructing Mapping object, the last parameter "self.access_same_data_considered_as_no_access" is optional self.mapping = Mapping( diff --git a/zigzag/classes/mapping/spatial/spatial_mapping.py b/zigzag/classes/mapping/spatial/spatial_mapping.py index 7db9b6b0..2a4268d9 100644 --- a/zigzag/classes/mapping/spatial/spatial_mapping.py +++ b/zigzag/classes/mapping/spatial/spatial_mapping.py @@ -119,6 +119,9 @@ def calc_unit_count(self): """ ASSERT: The bottom level (MAC level) unit count must be the same for all operand """ bottom_unit_count = [unit_count[op][0] for op in unit_count.keys()] + for x in bottom_unit_count: + if not x == bottom_unit_count[0]: + pass assert all( x == bottom_unit_count[0] for x in bottom_unit_count ), f"The MAC level unit count is not the same for all operand {bottom_unit_count}, please correct the spatial 
mapping." diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index e10453eb..3743afb8 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -161,20 +161,6 @@ def generate_user_spatial_mappings( ) = self.sort_oa_dim_unrollings_in_the_order_of_utilization( oa_dim_unrollings, descending=True ) - if len(oa_dim_unrollings) > 0: # oa_dim_unrollings is not [] - # Then only keep the combs in oa_dim_unrollings that have the highest oa_dim mapping utilization - # The closer to the front, the higher the oa_dim utilization rate. - updated_oa_dim_unrollings = [oa_dim_unrollings[0]] - # Check if there are other sm loops that has the same utilization with the highest one. - for i in range(1, len(hardware_utilization)): - if hardware_utilization[i] == hardware_utilization[0]: - updated_oa_dim_unrollings.append(oa_dim_unrollings[i]) - # [Optional] To reduce the simulation time, when there are still too many spatial unrollings, - # We keep only the first two unrollings for each oa_dim. - # You can comment out the next two lines if you want to check all spatial unrollings. - if len(updated_oa_dim_unrollings) > 2: - updated_oa_dim_unrollings = updated_oa_dim_unrollings[0:2] - oa_dim_unrollings = updated_oa_dim_unrollings # In case there are no unrollings (of size > 1) possible, add a single unrolling of size 1. # The loop dimension we pick is randomly chosen as the first loop dimension in the layer. @@ -186,7 +172,13 @@ def generate_user_spatial_mappings( # Now we have for each operational array dimension the layer dimensions and size they can be unrolled without fractional remainder. # Now we have to combine them into user-defined spatial mappings. 
+ # record down the number of yield + yield_count = 0 for combination in itertools.product(*unrollings): + if maximize_hardware_utilization and yield_count >= 2: + # 2 means: only check the top 2 spatial mapping with the highest hardware utilization + # Please modify "2" to other numbers if you want to check on more spatial mappings. + break # Zip the combination (which is a (layer_dim, layer_size) for each oa_dim with the oa_dim names. oa_dim_names = [oa_dim.name for oa_dim in oa_dims] # Extra check on the total unrolling size of a layer dim, if it is mapped on >=2 dimensions. @@ -194,6 +186,7 @@ def generate_user_spatial_mappings( layer_dim: layer_size for layer_dim, layer_size in self.layer.loop_dim_size.items() } + check_passed = True # initialization for unrolling_in_combination in combination: if unrolling_in_combination is None: continue @@ -223,7 +216,10 @@ def generate_user_spatial_mappings( for layer_dim, layer_size in combination_check.items(): if layer_size < 1: # the layer size/the unrolling size < 1 # It means the unrolling size > the layer size, which is incorrect and impossible. 
- continue + check_passed = False + break + if not check_passed: + continue user_spatial_mapping = { oa_dim_name: unrolling @@ -231,6 +227,7 @@ def generate_user_spatial_mappings( if unrolling is not None } yield user_spatial_mapping + yield_count += 1 def append_mix_spatial_unrollings( self, provided_oa_dim_unrollings, provided_oa_dim_unrolling_hints, oa_dim diff --git a/zigzag/classes/stages/SpatialMappingConversionStage.py b/zigzag/classes/stages/SpatialMappingConversionStage.py index 7707d172..44cbf89f 100644 --- a/zigzag/classes/stages/SpatialMappingConversionStage.py +++ b/zigzag/classes/stages/SpatialMappingConversionStage.py @@ -110,7 +110,7 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): for spatial_loop_element in spatial_loop: limited_user_spatial_mapping_to_check = ( self.generate_limited_user_spatial_mapping( - layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop_element + layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop_element, user_spatial_mapping ) ) if limited_user_spatial_mapping_to_check == None: @@ -131,7 +131,7 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): else: # single-dim sm loop limited_user_spatial_mapping_to_check = ( self.generate_limited_user_spatial_mapping( - layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop + layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop, user_spatial_mapping ) ) if limited_user_spatial_mapping_to_check == None: @@ -237,7 +237,7 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): ) def generate_limited_user_spatial_mapping( - self, layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop + self, layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop, user_spatial_mapping ): ## Do check on spatial mapping, and convert the mapping to a tuple (loop_dim_unrolled, loop_size_unrolled) = spatial_loop @@ -253,9 +253,69 @@ def generate_limited_user_spatial_mapping( layer_dim_size = layer_dim_sizes.get(loop_dim_unrolled, 1) loop_size_unrolled = min(layer_dim_size, 
loop_size_unrolled) # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size - temporal_remainder = int(np.ceil(layer_dim_size / loop_size_unrolled)) - loop_size_unrolled = layer_dim_size / temporal_remainder + # and if there is no more mapping for this layer dimension + no_more_mapping_for_current_layer_dim = self.check_if_there_is_further_oa_mapping_for_current_layer_dim( + oa_dim_name=oa_dim_name, + loop_dim_unrolled=loop_dim_unrolled, + user_spatial_mapping=user_spatial_mapping + ) + if no_more_mapping_for_current_layer_dim: + loop_size_unrolled_on_early_oa_dims = self.calc_unrolled_loop_size_on_early_oa_dims( + oa_dim_name=oa_dim_name, + loop_dim_unrolled=loop_dim_unrolled, + user_spatial_mapping=user_spatial_mapping + ) + temporal_remainder = int(np.ceil(layer_dim_size / (loop_size_unrolled*loop_size_unrolled_on_early_oa_dims))) + loop_size_unrolled = layer_dim_size / temporal_remainder / loop_size_unrolled_on_early_oa_dims return ( loop_dim_unrolled, loop_size_unrolled, ) + + def check_if_there_is_further_oa_mapping_for_current_layer_dim( + self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping + ): + # For the case when there is layer dimension that is mapped on multiple oa dimensions. + # We need to decide on which oa dimension to adjust the unrolling + # if the total unrolling size is not a multiple of the layer dimension size. + # In this case, we decide to only adjust the unrolling size on the last oa dimension, + # This function is to check if the current oa dimension is the last oa dimension for the current layer dim. 
+ start_check_on_layer_dim_mapping = False + no_more_mapping_for_current_layer_dim = True + for oa_dim_name_private, spatial_loop_private in user_spatial_mapping.items(): + if oa_dim_name == oa_dim_name_private: + start_check_on_layer_dim_mapping = True + continue + if start_check_on_layer_dim_mapping: + if self.is_nested_tuple(spatial_loop_private): # mix sm loop + for spatial_loop_element in spatial_loop_private: + loop_dim_unrolled_private = spatial_loop_element[0] + if loop_dim_unrolled == loop_dim_unrolled_private: + no_more_mapping_for_current_layer_dim = False + break + else: + loop_dim_unrolled_private = spatial_loop_private[0] + if loop_dim_unrolled == loop_dim_unrolled_private: + no_more_mapping_for_current_layer_dim = False + if not no_more_mapping_for_current_layer_dim: # early exit if the flag is already False + break + return no_more_mapping_for_current_layer_dim + + def calc_unrolled_loop_size_on_early_oa_dims( + self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping + ): + # calculate the unrolled loop size for the specific layer dim on oa dims earlier than current oa dim + loop_unrolled_size_already = 1 + for oa_dim_name_private, spatial_loop_private in user_spatial_mapping.items(): + if oa_dim_name == oa_dim_name_private: + break + if self.is_nested_tuple(spatial_loop_private): # mix sm loop + for spatial_loop_element in spatial_loop_private: + (loop_dim_unrolled_private, loop_size_unrolled_private) = spatial_loop_element + if loop_dim_unrolled == loop_dim_unrolled_private: + loop_unrolled_size_already *= loop_size_unrolled_private + else: + (loop_dim_unrolled_private, loop_size_unrolled_private) = spatial_loop_private + if loop_dim_unrolled == loop_dim_unrolled_private: + loop_unrolled_size_already *= loop_size_unrolled_private + return loop_unrolled_size_already \ No newline at end of file From 3f0f389f99abafaa6b287281e30fe6cfdd2d093d Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Fri, 10 Nov 2023 10:35:21 +0100 Subject: [PATCH 02/14] 
remove code for debugging --- zigzag/classes/cost_model/cost_model.py | 6 +++--- zigzag/classes/mapping/spatial/spatial_mapping.py | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/zigzag/classes/cost_model/cost_model.py b/zigzag/classes/cost_model/cost_model.py index 2a312969..cf25622a 100644 --- a/zigzag/classes/cost_model/cost_model.py +++ b/zigzag/classes/cost_model/cost_model.py @@ -246,9 +246,9 @@ def __init__( """ generate the integer spatial mapping from fractional spatial mapping (due to greedy mapping support). Later the fractional one is used for calculating energy, and the integer one is used for calculating latency""" - self.spatial_mapping_dict_int = spatial_mapping_fractional_to_int( - self.spatial_mapping.mapping_dict_origin - ) + # self.spatial_mapping_dict_int = spatial_mapping_fractional_to_int( + # self.spatial_mapping.mapping_dict_origin + # ) self.spatial_mapping_dict_int = self.spatial_mapping.mapping_dict_origin # For constructing Mapping object, the last parameter "self.access_same_data_considered_as_no_access" is optional diff --git a/zigzag/classes/mapping/spatial/spatial_mapping.py b/zigzag/classes/mapping/spatial/spatial_mapping.py index 2a4268d9..7db9b6b0 100644 --- a/zigzag/classes/mapping/spatial/spatial_mapping.py +++ b/zigzag/classes/mapping/spatial/spatial_mapping.py @@ -119,9 +119,6 @@ def calc_unit_count(self): """ ASSERT: The bottom level (MAC level) unit count must be the same for all operand """ bottom_unit_count = [unit_count[op][0] for op in unit_count.keys()] - for x in bottom_unit_count: - if not x == bottom_unit_count[0]: - pass assert all( x == bottom_unit_count[0] for x in bottom_unit_count ), f"The MAC level unit count is not the same for all operand {bottom_unit_count}, please correct the spatial mapping." 
From 675fbfb8f26097049aeb1dd8325feb7295431a0f Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Fri, 10 Nov 2023 12:45:53 +0100 Subject: [PATCH 03/14] Fix: add spatial_mapping_hint completion when a partial spatial_mapping is provided. New: add assertion when no legal spatial mapping is found. --- .gitignore | 5 +- debug.py | 26 ++++++++ zigzag/classes/opt/spatial/generator.py | 6 +- .../.SpatialMappingGeneratorStage.py.swp | Bin 0 -> 16384 bytes .../stages/SpatialMappingGeneratorStage.py | 57 +++++++++++------- 5 files changed, 70 insertions(+), 24 deletions(-) create mode 100644 debug.py create mode 100644 zigzag/classes/stages/.SpatialMappingGeneratorStage.py.swp diff --git a/.gitignore b/.gitignore index 08983801..89807f12 100644 --- a/.gitignore +++ b/.gitignore @@ -149,4 +149,7 @@ docs/Makefile docs/make.bat # documentation output -html/ \ No newline at end of file +html/ + +# debug file +debug* \ No newline at end of file diff --git a/debug.py b/debug.py new file mode 100644 index 00000000..918bf98a --- /dev/null +++ b/debug.py @@ -0,0 +1,26 @@ +from zigzag.api import get_hardware_performance_zigzag + +opt = 'EDP' +model = "alexnet" +onnx_model_path = f"zigzag/inputs/examples/workload/{model}.onnx" +workload = onnx_model_path + +hwarchs = ["Edge_TPU_like", "Ascend_like", "Eyeriss_like", "Meta_prototype", "Tesla_NPU_like", "TPU_like"] + +for hwarch in hwarchs: + + mapping = f"zigzag.inputs.examples.mapping.default" + accelerator = f"zigzag.inputs.examples.hardware.{hwarch}" + + dump_filename_pattern=f"outputs/{hwarch}-{model}-layer_?.json" + pickle_filename = f"outputs/{hwarch}-{model}-saved_list_of_cmes.pickle" + + energy, latency, cme = get_hardware_performance_zigzag(workload=workload, + accelerator=accelerator, + mapping=mapping, + opt=opt, + dump_filename_pattern=dump_filename_pattern, + pickle_filename=pickle_filename) + print(f"Total network energy = {energy:.2e} pJ") + print(f"Total network latency = {latency:.2e} cycles") + print(f"Total edp = 
{energy*latency:.2e} pJ*cycles") \ No newline at end of file diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index 3743afb8..3e5fe140 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -177,7 +177,7 @@ def generate_user_spatial_mappings( for combination in itertools.product(*unrollings): if maximize_hardware_utilization and yield_count >= 2: # 2 means: only check the top 2 spatial mapping with the highest hardware utilization - # Please modify "2" to other numbers if you want to check on more spatial mappings. + # Modify "2" to other numbers if you want to check on more spatial mappings. break # Zip the combination (which is a (layer_dim, layer_size) for each oa_dim with the oa_dim names. oa_dim_names = [oa_dim.name for oa_dim in oa_dims] @@ -228,6 +228,10 @@ def generate_user_spatial_mappings( } yield user_spatial_mapping yield_count += 1 + # If yield_count==0, it means there is no legal spatial mapping found. + # The reason is that the spatial mapping provided by the user has exceeded the layer dim size, + # therefore the loop cannot pass the check. + assert yield_count > 0, "There is no legal spatial mapping found. Please make sure the provided spatial mappings do not exceed the layer dimension size." 
def append_mix_spatial_unrollings( self, provided_oa_dim_unrollings, provided_oa_dim_unrolling_hints, oa_dim diff --git a/zigzag/classes/stages/.SpatialMappingGeneratorStage.py.swp b/zigzag/classes/stages/.SpatialMappingGeneratorStage.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..967ac254e9fd94a420b53a665ecee44f6e05fdf8 GIT binary patch literal 16384 zcmeHNU8o&b72c^fCfaCets<$o&9U~}>$!7t4TxTHp)u7o(4=B=Qz6D-?#w>tOz)hT zlli$fC*fA;4+w%_tq6&&UrOM+L6ih1hrKtoqIA8@H{!_pnxP^qHntN_B|FMDw$!-!qeO5Oc*I_{14l)xTcPFKeX|>cW-@fF z*!6qnmK7=Us@uD2^;Qf2snYhUF1I}^iqJlatsZXYhG=SeXRP566iz8$g;pGd=jg35 z9E}Ii{Ana`Q%m5GIQGFcHuct#_sTo(*u1Hw(Jxfd=^aKq8a~;zVDm3zd=8r(06mKqG-h0*wS32{aOD zB+y8pk-+Ov0+u7hdoZr=Qo@h#|5^V3*S87r7vMV32iAdC-YUeefL{PV27Uy59e4)# z8qfzm1>6tZ0UQAK1K&O*#J7M5I0hU5Zrmosqre5=e&9pEVc>S)UvClO&%krQ=YbIr z0>^>7fPcJMh~EO&fy+P-xEnYO90dMwP>AmX-vhn?d=7X3_$csp;FSYHyafCJ_%iSi za1^*3I1K#oO+q{l41s%rm-Y+sEN~gv0FDFq0Dr#~asZwH-Us~djTkfV4d7v54R{qB zC(i?$z^m9~c^P;XcpUf$a4&Ed@Ds?N=HNRt53?_+FXg#i=|--PQxLwb+9G?A-4mS0 zFr)NrggHnLM?`W%UgQ_UFxYV&~ zfpx_lxL1|gw?b#v3YD3}uIFCG8I!=@h?LhgW>)D``XQ>yx3)ZG4%{n~0?#BeroK?s z`d%E>b$!gKp6!j9feXR3uR$>Us0bGnq<=hC{AEqT7LLIu|#EeZF~^VlwAMy@27*&TJHul(IKJ&UDYn)C87s3|=Hupd7Y^w0f&8Jo?6UqcRX~^WH=I90!f{~JgYExZ3cP$wQ z?5qZA5QL)@PwgmA((yM(YNdo<;^S>HRKrk3%8!{xgGshp@{@@V3Yc_06h}@_J3jJ* z#Mufw^T3MZD#=Lh_FcO#;p;Gi{*DS!NsO5eRh)#rJRc@%TqbdamK!N~!SWLIu`mq6 zc0N1w+6#I;6&iNv#yBA8wYx_iBtIdq@m%f6z7?@*l2ajLpeDw|ypn34 z4g869X~u9eLgj`u4|ZSK+a~wY=HKyz+?TSw&V0)vg$N4{MieSL_C|tOZKc2RbeAWQ z3DE*Luw37C{b3SE)U_YPP>{GIKW<@FrING{2z!DGYIb0~xoo*I$=lHru6tbJ4oN_ndkIcrrXJ)zs1D|8pyGZ|n19yknM)oNf98800Bp7Lr59A10LFfr9`Y?^I~ zRLr1jLhC(R^`nklUfxD1(Th?66g+nJB41`^N^*m&Ft59iI5t?_@m+kC|&*};8-ftGGMu^_t>ny$dzxw(Nm)IlAMrf?^2>{uAyHDGQV!U7*+#AzF2?Vx>7s zw!%fS=i=zNwAPUbrK<>;E|wE8N1?%kyt1@S)3*l7G?#>M2g4vFuZ+{)sfm@RazaNs zq8kPSnVu#$^l5T~#}yefTT0zidnN@sc%Ic{wW+P0c{2*G7gKy={hT`Fu`YY7oI1a;@PubX3qo`e2?P*Z7K~kg!N;-k!5#-4nRc^@>4psw7>k&r7|59r| 
z@K@9oFq|(F6m2dh`C?zhZ2D}FD=iF-=;@mI6jk?v{tsG_cSrnZS3dtFfcy!=Tv!|C z&sR|xRZYkzt{|QAbj|MqBEPmLGPdYmj)|${2zl_8EX}MgkF;&TO2~`8Ax|KxS}A0E zdS%FVf!Hz)68PQ-5e-)SfGjgNgD8zDiAf5xFw$<~+iW=aj~v?$l@+V$rh+UAkw@AqrE6Kw zf=EsnGAUkXI;l=(K-`v-%>_1txH5H?T_aa{jzm({q5akBhH6KuD{*Kc_vi*?RU&=m zc6kL-+)3>~kcf<*;%Ad1pmblhtk-42G_>Vqw!1W|$&-a_)NG;HwHb~O&iO@V8A?C+c3OT_JE+)ocD%?fz=Cw=S>M@wc~#GbHnI>+PGER(>*| zeNI(tl@WZ!n6GKWgY?Iny_&PkXX}ju%phOdTRUni>0u-7Y{1htV~MC=8L$YHdDU)%SU zmOM?7riBk)iVb^ndN!%I9CuxOeJkzkWLwWl;(MH64c0%^snb#B)Xz4xtWgBC=qH<8 zX8+sIH5?76afbffg57L Date: Fri, 10 Nov 2023 12:46:42 +0100 Subject: [PATCH 04/14] Remove temporal file generated by gvim --- .../stages/.SpatialMappingGeneratorStage.py.swp | Bin 16384 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 zigzag/classes/stages/.SpatialMappingGeneratorStage.py.swp diff --git a/zigzag/classes/stages/.SpatialMappingGeneratorStage.py.swp b/zigzag/classes/stages/.SpatialMappingGeneratorStage.py.swp deleted file mode 100644 index 967ac254e9fd94a420b53a665ecee44f6e05fdf8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHNU8o&b72c^fCfaCets<$o&9U~}>$!7t4TxTHp)u7o(4=B=Qz6D-?#w>tOz)hT zlli$fC*fA;4+w%_tq6&&UrOM+L6ih1hrKtoqIA8@H{!_pnxP^qHntN_B|FMDw$!-!qeO5Oc*I_{14l)xTcPFKeX|>cW-@fF z*!6qnmK7=Us@uD2^;Qf2snYhUF1I}^iqJlatsZXYhG=SeXRP566iz8$g;pGd=jg35 z9E}Ii{Ana`Q%m5GIQGFcHuct#_sTo(*u1Hw(Jxfd=^aKq8a~;zVDm3zd=8r(06mKqG-h0*wS32{aOD zB+y8pk-+Ov0+u7hdoZr=Qo@h#|5^V3*S87r7vMV32iAdC-YUeefL{PV27Uy59e4)# z8qfzm1>6tZ0UQAK1K&O*#J7M5I0hU5Zrmosqre5=e&9pEVc>S)UvClO&%krQ=YbIr z0>^>7fPcJMh~EO&fy+P-xEnYO90dMwP>AmX-vhn?d=7X3_$csp;FSYHyafCJ_%iSi za1^*3I1K#oO+q{l41s%rm-Y+sEN~gv0FDFq0Dr#~asZwH-Us~djTkfV4d7v54R{qB zC(i?$z^m9~c^P;XcpUf$a4&Ed@Ds?N=HNRt53?_+FXg#i=|--PQxLwb+9G?A-4mS0 zFr)NrggHnLM?`W%UgQ_UFxYV&~ zfpx_lxL1|gw?b#v3YD3}uIFCG8I!=@h?LhgW>)D``XQ>yx3)ZG4%{n~0?#BeroK?s 
z`d%E>b$!gKp6!j9feXR3uR$>Us0bGnq<=hC{AEqT7LLIu|#EeZF~^VlwAMy@27*&TJHul(IKJ&UDYn)C87s3|=Hupd7Y^w0f&8Jo?6UqcRX~^WH=I90!f{~JgYExZ3cP$wQ z?5qZA5QL)@PwgmA((yM(YNdo<;^S>HRKrk3%8!{xgGshp@{@@V3Yc_06h}@_J3jJ* z#Mufw^T3MZD#=Lh_FcO#;p;Gi{*DS!NsO5eRh)#rJRc@%TqbdamK!N~!SWLIu`mq6 zc0N1w+6#I;6&iNv#yBA8wYx_iBtIdq@m%f6z7?@*l2ajLpeDw|ypn34 z4g869X~u9eLgj`u4|ZSK+a~wY=HKyz+?TSw&V0)vg$N4{MieSL_C|tOZKc2RbeAWQ z3DE*Luw37C{b3SE)U_YPP>{GIKW<@FrING{2z!DGYIb0~xoo*I$=lHru6tbJ4oN_ndkIcrrXJ)zs1D|8pyGZ|n19yknM)oNf98800Bp7Lr59A10LFfr9`Y?^I~ zRLr1jLhC(R^`nklUfxD1(Th?66g+nJB41`^N^*m&Ft59iI5t?_@m+kC|&*};8-ftGGMu^_t>ny$dzxw(Nm)IlAMrf?^2>{uAyHDGQV!U7*+#AzF2?Vx>7s zw!%fS=i=zNwAPUbrK<>;E|wE8N1?%kyt1@S)3*l7G?#>M2g4vFuZ+{)sfm@RazaNs zq8kPSnVu#$^l5T~#}yefTT0zidnN@sc%Ic{wW+P0c{2*G7gKy={hT`Fu`YY7oI1a;@PubX3qo`e2?P*Z7K~kg!N;-k!5#-4nRc^@>4psw7>k&r7|59r| z@K@9oFq|(F6m2dh`C?zhZ2D}FD=iF-=;@mI6jk?v{tsG_cSrnZS3dtFfcy!=Tv!|C z&sR|xRZYkzt{|QAbj|MqBEPmLGPdYmj)|${2zl_8EX}MgkF;&TO2~`8Ax|KxS}A0E zdS%FVf!Hz)68PQ-5e-)SfGjgNgD8zDiAf5xFw$<~+iW=aj~v?$l@+V$rh+UAkw@AqrE6Kw zf=EsnGAUkXI;l=(K-`v-%>_1txH5H?T_aa{jzm({q5akBhH6KuD{*Kc_vi*?RU&=m zc6kL-+)3>~kcf<*;%Ad1pmblhtk-42G_>Vqw!1W|$&-a_)NG;HwHb~O&iO@V8A?C+c3OT_JE+)ocD%?fz=Cw=S>M@wc~#GbHnI>+PGER(>*| zeNI(tl@WZ!n6GKWgY?Iny_&PkXX}ju%phOdTRUni>0u-7Y{1htV~MC=8L$YHdDU)%SU zmOM?7riBk)iVb^ndN!%I9CuxOeJkzkWLwWl;(MH64c0%^snb#B)Xz4xtWgBC=qH<8 zX8+sIH5?76afbffg57L Date: Fri, 10 Nov 2023 12:50:25 +0100 Subject: [PATCH 05/14] reformat generator.py for a black coding style --- zigzag/classes/opt/spatial/generator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index 3e5fe140..5dfcd9f8 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -186,7 +186,7 @@ def generate_user_spatial_mappings( layer_dim: layer_size for layer_dim, layer_size in self.layer.loop_dim_size.items() } - check_passed = True # initialization + check_passed = True # 
initialization for unrolling_in_combination in combination: if unrolling_in_combination is None: continue @@ -231,7 +231,9 @@ def generate_user_spatial_mappings( # If yield_count==0, it means there is no legal spatial mapping found. # The reason is that the spatial mapping provided by the user has exceeded the layer dim size, # therefore the loop cannot pass the check. - assert yield_count > 0, "There is no legal spatial mapping found. Please make sure the provided spatial mappings do not exceed the layer dimension size." + assert ( + yield_count > 0 + ), "There is no legal spatial mapping found. Please make sure the provided spatial mappings do not exceed the layer dimension size." def append_mix_spatial_unrollings( self, provided_oa_dim_unrollings, provided_oa_dim_unrolling_hints, oa_dim From b9738e1b09df26540fa0bcfd3f546795439a15e8 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Fri, 10 Nov 2023 20:14:22 +0100 Subject: [PATCH 06/14] New feature: supporting ox/oy diagonal mapping. Fix: spatial_mapping_int in cost_model.py is fetched from SpatialGeneratorStage now to avoid the mapping inconsistence. 
--- zigzag/classes/cost_model/cost_model.py | 4 +- zigzag/classes/opt/spatial/generator.py | 410 ++++++++++++++++-- zigzag/classes/stages/CostModelStage.py | 4 + .../stages/SpatialMappingConversionStage.py | 124 +++--- .../stages/SpatialMappingGeneratorStage.py | 180 +++++++- 5 files changed, 610 insertions(+), 112 deletions(-) diff --git a/zigzag/classes/cost_model/cost_model.py b/zigzag/classes/cost_model/cost_model.py index cf25622a..d4296170 100644 --- a/zigzag/classes/cost_model/cost_model.py +++ b/zigzag/classes/cost_model/cost_model.py @@ -211,12 +211,14 @@ def __init__( accelerator, layer, spatial_mapping, + spatial_mapping_int, temporal_mapping, access_same_data_considered_as_no_access=True, ): self.accelerator = accelerator self.layer = layer self.spatial_mapping = spatial_mapping + self.spatial_mapping_int = spatial_mapping_int # the original spatial mapping without decimal self.temporal_mapping = temporal_mapping self.access_same_data_considered_as_no_access = ( access_same_data_considered_as_no_access @@ -249,7 +251,7 @@ def __init__( # self.spatial_mapping_dict_int = spatial_mapping_fractional_to_int( # self.spatial_mapping.mapping_dict_origin # ) - self.spatial_mapping_dict_int = self.spatial_mapping.mapping_dict_origin + self.spatial_mapping_dict_int = self.spatial_mapping_int # For constructing Mapping object, the last parameter "self.access_same_data_considered_as_no_access" is optional self.mapping = Mapping( diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index 5dfcd9f8..c3e35582 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -6,6 +6,8 @@ from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy from zigzag.classes.hardware.architecture.operational_array import OperationalArray +import math + ## Class that generates valid user-format spatial mappings. 
class UserSpatialMappingGenerator: @@ -19,6 +21,7 @@ def __init__( defined_mapping=None, enable_mix_spatial_mapping_generation=False, maximize_hardware_utilization=True, + enable_weight_diagonal_mapping=False, ) -> None: self.layer = layer self.accelerator = accelerator @@ -27,11 +30,13 @@ def __init__( enable_mix_spatial_mapping_generation ) self.maximize_hardware_utilization = maximize_hardware_utilization + self.enable_weight_diagonal_mapping = enable_weight_diagonal_mapping def run(self): return self.generate_user_spatial_mappings( enable_mix_spatial_mapping_generation=self.enable_mix_spatial_mapping_generation, maximize_hardware_utilization=self.maximize_hardware_utilization, + enable_weight_diagonal_mapping=self.enable_weight_diagonal_mapping, ) ## Generator that yields user-defined spatial mappings. @@ -51,7 +56,10 @@ def run(self): # layer_dim can be unrolled if the BW allows it (assumes flexible "bus" reads) # \endcode def generate_user_spatial_mappings( - self, enable_mix_spatial_mapping_generation, maximize_hardware_utilization + self, + enable_mix_spatial_mapping_generation, + maximize_hardware_utilization, + enable_weight_diagonal_mapping, ): core_id = self.layer.core_allocation core: Core = self.accelerator.get_core(core_id=core_id) @@ -179,53 +187,30 @@ def generate_user_spatial_mappings( # 2 means: only check the top 2 spatial mapping with the highest hardware utilization # Modify "2" to other numbers if you want to check on more spatial mappings. break + + legal_spatial_loop, left_layer_dim_size = self.check_spatial_loop_legality( + combination=combination, layer=self.layer + ) + if not legal_spatial_loop: + continue # Zip the combination (which is a (layer_dim, layer_size) for each oa_dim with the oa_dim names. oa_dim_names = [oa_dim.name for oa_dim in oa_dims] - # Extra check on the total unrolling size of a layer dim, if it is mapped on >=2 dimensions. 
- combination_check = { - layer_dim: layer_size - for layer_dim, layer_size in self.layer.loop_dim_size.items() - } - check_passed = True # initialization - for unrolling_in_combination in combination: - if unrolling_in_combination is None: - continue - if self.is_nested_tuple(unrolling_in_combination): - for sub_unrolling_in_combination in unrolling_in_combination: - unrolling_layer_dim = sub_unrolling_in_combination[0] - unrolling_layer_size = sub_unrolling_in_combination[1] - if unrolling_layer_dim in combination_check.keys(): - combination_check[ - unrolling_layer_dim - ] /= unrolling_layer_size - else: - # The unrolled layer dim does not exist in current layer. - # This only happens when the spatial mapping is user-defined, which - # contains non-existent layer dims in current layer. - pass - else: - unrolling_layer_dim = unrolling_in_combination[0] - unrolling_layer_size = unrolling_in_combination[1] - if unrolling_layer_dim in combination_check.keys(): - combination_check[unrolling_layer_dim] /= unrolling_layer_size - else: - # The unrolled layer dim does not exist in current layer. - # This only happens when the spatial mapping is user-defined, which - # contains non-existent layer dims in current layer. - pass - for layer_dim, layer_size in combination_check.items(): - if layer_size < 1: # the layer size/the unrolling size < 1 - # It means the unrolling size > the layer size, which is incorrect and impossible. - check_passed = False - break - if not check_passed: - continue user_spatial_mapping = { oa_dim_name: unrolling for (oa_dim_name, unrolling) in zip(oa_dim_names, combination) if unrolling is not None } + # Add act ir loop if it is weight stationary and the innermost memories serve for act. 
+ if enable_weight_diagonal_mapping: + user_spatial_mapping = self.add_input_pr_spatial_loop_if_enabled( + layer=self.layer, + provided_user_spatial_mapping=user_spatial_mapping, + user_spatial_mapping_hint=user_spatial_mapping_hint, + innermost_levels=innermost_levels, + left_layer_dim_size=left_layer_dim_size, + enable_mix_spatial_mapping_generation=enable_mix_spatial_mapping_generation, + ) yield user_spatial_mapping yield_count += 1 # If yield_count==0, it means there is no legal spatial mapping found. @@ -235,13 +220,49 @@ def generate_user_spatial_mappings( yield_count > 0 ), "There is no legal spatial mapping found. Please make sure the provided spatial mappings do not exceed the layer dimension size." + def check_spatial_loop_legality(self, combination, layer): + # Extra check on the total unrolling size of a layer dim, if it is mapped on >=2 dimensions. + combination_check = { + layer_dim: layer_size + for layer_dim, layer_size in layer.loop_dim_size.items() + } + legal_spatial_loop = True # initialization + for unrolling_in_combination in combination: + if unrolling_in_combination is None: + continue + if self.is_nested_tuple(unrolling_in_combination): + for sub_unrolling_in_combination in unrolling_in_combination: + unrolling_layer_dim = sub_unrolling_in_combination[0] + unrolling_layer_size = sub_unrolling_in_combination[1] + if unrolling_layer_dim in combination_check.keys(): + combination_check[unrolling_layer_dim] /= unrolling_layer_size + else: + # The unrolled layer dim does not exist in current layer. + # This only happens when the spatial mapping is user-defined, which + # contains non-existent layer dims in current layer. + pass + else: + unrolling_layer_dim = unrolling_in_combination[0] + unrolling_layer_size = unrolling_in_combination[1] + if unrolling_layer_dim in combination_check.keys(): + combination_check[unrolling_layer_dim] /= unrolling_layer_size + else: + # The unrolled layer dim does not exist in current layer. 
+ # This only happens when the spatial mapping is user-defined, which + # contains non-existent layer dims in current layer. + pass + for layer_dim, layer_size in combination_check.items(): + if layer_size < 1: # the layer size/the unrolling size < 1 + # It means the unrolling size > the layer size, which is incorrect and impossible. + legal_spatial_loop = False + break + return legal_spatial_loop, combination_check + def append_mix_spatial_unrollings( self, provided_oa_dim_unrollings, provided_oa_dim_unrolling_hints, oa_dim ): # Create and append new mix spatial unrollings to original oa_dim_unrollings # An example of mix: (("K",2), ("OX", 2)) - import math - oa_dim_unrollings = provided_oa_dim_unrollings oa_dim_unrolling_hints = provided_oa_dim_unrolling_hints if ( @@ -387,8 +408,6 @@ def sort_oa_dim_unrollings_in_the_order_of_utilization( # @param descending: # True -- the higher the mapping utilization is, the closer to the front it is. # False -- the lower the mapping utilization is, the closer to the front it is. - import math - oa_dim_unrollings = provided_oa_dim_unrollings if len(oa_dim_unrollings) > 1: # First we will record down the hardware utilization of each spatial unrolling in comb_value @@ -426,6 +445,309 @@ def sort_oa_dim_unrollings_in_the_order_of_utilization( hardware_utilization = None return oa_dim_unrollings, hardware_utilization + def add_input_pr_spatial_loop_if_enabled( + self, + layer, + provided_user_spatial_mapping, + user_spatial_mapping_hint, + innermost_levels, + left_layer_dim_size, + enable_mix_spatial_mapping_generation, + ): + # This function is used to support diagonal spatial mapping + # when input/activation is served in the innermost memories and the weight is stationary. + user_spatial_mapping = provided_user_spatial_mapping + # get the link from layer op to mem op + layer_op_to_mem_op: dict = layer.memory_operand_links + # check if it is weight stationary. + # keep the spatial loop as it was if it is not weight stationary. 
+ if len(layer.constant_operands) > 1: + return user_spatial_mapping + # get weight operand name + const_operand = layer.constant_operands[0] # weight representation + # get activation operand name + act_operand = [ + operand for operand in layer.input_operands if operand != const_operand + ][0] + # get output operand name + output_operand = layer.output_operand + # get name of OX, OY (weight ir layer dims) + weight_ir_layer_dims: list = layer.operand_loop_dim[const_operand]["ir"] + # get the oa_dim name served by input / output innermost memory level + for memory_level in innermost_levels: + mem_ops = memory_level.operands + if layer_op_to_mem_op[act_operand] in mem_ops: + act_served_oa_dim: set = memory_level.served_dimensions + if layer_op_to_mem_op[output_operand] in mem_ops: + output_served_oa_dim: set = memory_level.served_dimensions + # check if act is not served in the innermost memories, or it is uti-casting for act. + # keep the spatial loop as it was if act is not served. + if "act_served_oa_dim" not in locals() or len(act_served_oa_dim) == 0: + return user_spatial_mapping + + act_served_oa_dim_name = list(act_served_oa_dim)[0].name + output_served_oa_dim_name = list(output_served_oa_dim)[0].name + act_served_oa_dim_size = list(act_served_oa_dim)[0].size + output_served_oa_dim_size = list(output_served_oa_dim)[0].size + + # check if OX / OY in user_spatial_mapping_hint. Or else target_layer_dim will be empty. + target_layer_dim = [] # OX or OY or both + for layer_dim in weight_ir_layer_dims: + if layer_dim in user_spatial_mapping_hint[act_served_oa_dim_name]: + target_layer_dim.append(layer_dim) + + # no further execution if OX / OY unrolling is not in user_spatial_mapping_hint + if len(target_layer_dim) == 0: + return user_spatial_mapping + + ############################################ + # Get existed mapping size on act_served_oa_dim, which will be added with OX, OY later. 
+ if ( + act_served_oa_dim_name in user_spatial_mapping.keys() + ): # there already is sm loop + sm_loop = user_spatial_mapping[act_served_oa_dim_name] + if self.is_nested_tuple(sm_loop): # a mix layer sm mapping + exist_act_loop_size = 1 + for element in sm_loop: + exist_act_loop_size *= element[1] + else: # a single layer sm mapping + exist_act_loop_size = sm_loop[1] + else: # there is no sm loop mapped on act served dim + exist_act_loop_size = 1 + + # Check if the existed mapping size is more than half of current oa dim size. + # If so, it means there is no space for extra mapping even with a size of 2. + # In that case, we will do nothing but return the original spatial mapping + if exist_act_loop_size * 2 > act_served_oa_dim_size: + return user_spatial_mapping + + # fetch pr loop pairs for activation, e.g. {"IX": ["OX", "FX"]} + act_pr_layer_dims: dict = layer.operand_loop_dim[act_operand]["pr"] + + # Next we get existed mapping size on output_served_oa_dim + # there are two classes of mapping: + # (1) ir mapping to weight, e.g. "C" + # (2) r mapping to weight, e.g. "FX", "FY" (kernel size) + + # We firstly create a dict for later recording down existed r mapping to weight + # it will be like: + # weight_r_loop = {"OX": {"FX": 1}, "OY": {"FY": 1}} + weight_r_loop: dict = {} # here we put a nested dict for recording + loops_name_for_kernel_size: list = [] + pr_sm_link: dict = ( + {} + ) # here we record down the link between pr loops, e.g. link["FX"]="OX" + + for weight_ir_layer_dim in weight_ir_layer_dims: + for [layer_dim1, layer_dim2] in act_pr_layer_dims.values(): + if weight_ir_layer_dim in [layer_dim1, layer_dim2]: + break + # as we are unsure in act_pr_layer_dims, it is [OX, FX] or [FX, OX], we consider two possibilities. 
+ if layer_dim1 == weight_ir_layer_dim: # if the first one is OX / OY + weight_r_loop[layer_dim1] = {layer_dim2: 1} # 1 by default + loops_name_for_kernel_size.append(layer_dim2) + pr_sm_link[layer_dim2] = layer_dim1 + else: # layer_dim2 == weight_ir_layer_dim, the second one is OX / OY + weight_r_loop[layer_dim2] = {layer_dim1: 1} # 1 by default + loops_name_for_kernel_size.append(layer_dim1) + pr_sm_link[layer_dim1] = layer_dim2 + + # Next we will update the dict, and also find the mapping size (weight ir loop size) we do not care about. + weight_ir_loop_size = 1 # default value + sm_loop = user_spatial_mapping[output_served_oa_dim_name] + if self.is_nested_tuple(sm_loop): # a mix sm mapping + for element in sm_loop: + # same operation as above + layer_dim = element[0] + mapping_size = element[1] + if layer_dim in loops_name_for_kernel_size: # layer_dim in ["FX", "FY"] + paired_pr_layer_dim = pr_sm_link[ + layer_dim + ] # "FX" -> "OX", "FY" -> "OY" + weight_r_loop[paired_pr_layer_dim][layer_dim] *= mapping_size + else: # not care + weight_ir_loop_size *= mapping_size + else: # a single layer sm mapping + layer_dim = sm_loop[0] + mapping_size = sm_loop[1] + if layer_dim in loops_name_for_kernel_size: # layer_dim in ["FX", "FY"] + paired_pr_layer_dim = pr_sm_link[ + layer_dim + ] # "FX" -> "OX", "FY" -> "OY" + weight_r_loop[paired_pr_layer_dim][layer_dim] *= mapping_size + else: # not care + weight_ir_loop_size *= mapping_size + + # At this point, we already know what sm mapping existed. 
+ ############################################ + + # Next we will try to add possible OX / OY mapping + # find all possible OX / OY mapping breakdown and put them in the pool + # It looks like: + # sm_pools = {"OX": [("OX",2),("OX",5),("OX",5)], "OY": [("OY",2),("OY",5),("OY",5)]} + sm_pools_to_add: dict = {} + for layer_dim in target_layer_dim: + layer_size = self.layer.loop_dim_size[layer_dim] + layer_size_breakdown: list = self.prime_factors(layer_size) + + # try to find the maximum OX / OY and add it to the list + # (1) check on act_served_oa_dim (round down to integer) + max_allowed_dim_size_on_act_served_dim = math.floor( + act_served_oa_dim_size / exist_act_loop_size + ) + # (2) check on output_served_oa_dim + existed_pr_mapping = list(weight_r_loop[layer_dim].values())[0] + for key in weight_r_loop.keys(): + if key != layer_dim: + ir_layer_dim_to_current_layer_dim = key + existed_pr_mapping_but_ir_to_current_layer_dim = list( + weight_r_loop[ir_layer_dim_to_current_layer_dim].values() + )[0] + max_allowed_dim_size_on_output_served_dim = ( + output_served_oa_dim_size + / weight_ir_loop_size + / existed_pr_mapping_but_ir_to_current_layer_dim + ) - (existed_pr_mapping - 1) + # round down to integer + max_allowed_dim_size_on_output_served_dim = math.floor( + max_allowed_dim_size_on_output_served_dim + ) + max_allowed_target_dim_size = min( + max_allowed_dim_size_on_act_served_dim, + max_allowed_dim_size_on_output_served_dim, + ) + # check whether the element in layer_size_breakdown is allowed to add + legal_layer_size_breakdown = [] + for factor in layer_size_breakdown: + if ( + factor <= max_allowed_target_dim_size + and factor <= left_layer_dim_size[layer_dim] + ): + legal_layer_size_breakdown.append(factor) + if len(legal_layer_size_breakdown) > 0: + sm_pools_to_add[layer_dim] = [ + tuple([layer_dim, size]) for size in legal_layer_size_breakdown + ] + + # check if there is anything in the pool + if len(sm_pools_to_add) == 0: + return user_spatial_mapping + + # 
Generate possible combination + # In the for loop below, we will first try only with OX or OY. Then with their combination. + # In the end, we will only keep the best one, which has the maximal value of OX*OY. + # If there are multiple combs having the same OX*OY, we will keep the first one, as their costs are the same. + best_comb = [] # list initialization + best_comb_size = 0 # reference value to find the best comb + target_layer_dim = [ + layer_dim + for layer_dim in target_layer_dim + if layer_dim in sm_pools_to_add.keys() + ] + if enable_mix_spatial_mapping_generation: + allowed_dim_comb_length = len(target_layer_dim) + else: + allowed_dim_comb_length = 1 + for dim_comb_length in range(1, allowed_dim_comb_length + 1): + for dim_comb in itertools.combinations(target_layer_dim, dim_comb_length): + # we will create a temporary pool for each dim combination + sm_pools_mix = [] + for layer_dim in dim_comb: + sm_pools_mix += sm_pools_to_add[layer_dim] + max_comb_length = len( + sm_pools_mix + ) # the max possible length of combination + for comb_length in range(1, max_comb_length + 1): + for comb in itertools.combinations(sm_pools_mix, comb_length): + # At this point, in comb, we have a possible OX / OY mapping + # First we get current comb size + # Example: comb_mapping = {"OX": 5, "OY": 10} + comb_mapping: dict = {} + for layer_dim in dim_comb: + comb_mapping[layer_dim] = 1 # default value + for element in comb: + layer_dim = element[0] + mapping_size = element[1] + comb_mapping[layer_dim] *= mapping_size + # Skip if current unrolling on a layer_dim is 1, which means it has been checked already. 
+ curr_comb_already_checked = False + for unroll_size in comb_mapping.values(): + if unroll_size == 1: + curr_comb_already_checked = True + break + if curr_comb_already_checked: + continue + # We will check if this comb is possible + # (1) check on left_layer_dim_size + curr_comb_illegal = False + for unroll_dim, unroll_size in comb_mapping.items(): + if unroll_size > left_layer_dim_size[unroll_dim]: + curr_comb_illegal = True + break + if curr_comb_illegal: + continue + # (2) check on act_served_oa_dim + comb_size = math.prod([v for v in comb_mapping.values()]) + required_oa_dim_size = exist_act_loop_size * comb_size + if required_oa_dim_size > act_served_oa_dim_size: + continue # the comb is not possible on act_served_oa_dim + # (3) check on output_served_oa_dim + required_oa_dim_size = weight_ir_loop_size + for layer_dim in comb_mapping.keys(): + existed_pr_mapping = list( + weight_r_loop[layer_dim].values() + )[0] + pr_mapping_to_add = comb_mapping[layer_dim] + new_mapping_size = ( + existed_pr_mapping + pr_mapping_to_add - 1 + ) + required_oa_dim_size *= new_mapping_size + if len(comb_mapping) == 1: # only OX or OY + # add the other existed pr loop to required_oa_dim_size, + # because previously it is not counted in output_served_oa_dim_size. 
+ sole_dim = list(comb_mapping.keys())[0] + the_other_pr_mapping_name = [ + key for key in weight_r_loop.keys() if key != sole_dim + ][0] + the_other_pr_mapping_size = list( + weight_r_loop[the_other_pr_mapping_name].values() + )[0] + required_oa_dim_size *= the_other_pr_mapping_size + if required_oa_dim_size > output_served_oa_dim_size: + continue # this comb is not possible on output_served_oa_dim + # (4) compare with best_comb + if comb_size > best_comb_size: + # reformat the comb and merge repetitive elements + # example: (("OX", 5), ("OY", 2)) + new_comb: list = [ + (layer_dim, mapping_size) + for (layer_dim, mapping_size) in comb_mapping.items() + ] + best_comb = new_comb + + # At this point, we get the best possible comb to add. Then we can add that to the current sm mapping + if len(best_comb) == 0: # did not find any comb + return user_spatial_mapping + else: + if ( + act_served_oa_dim_name in user_spatial_mapping.keys() + ): # there already is sm loop previously + act_served_mapping_to_change = user_spatial_mapping[ + act_served_oa_dim_name + ] + if self.is_nested_tuple( + act_served_mapping_to_change + ): # originally it is a mix mapping + reformed_sm = list(act_served_mapping_to_change) + best_comb + else: # originally it is a single layer mapping + reformed_sm = [act_served_mapping_to_change] + best_comb + else: # there is no sm loop on act served oa dim previously + reformed_sm = best_comb + reformed_sm = tuple(reformed_sm) + user_spatial_mapping[act_served_oa_dim_name] = reformed_sm + + return user_spatial_mapping + @staticmethod def all_unique(items): return len(set(items)) == len(items) diff --git a/zigzag/classes/stages/CostModelStage.py b/zigzag/classes/stages/CostModelStage.py index 941c7db0..ce2d135b 100644 --- a/zigzag/classes/stages/CostModelStage.py +++ b/zigzag/classes/stages/CostModelStage.py @@ -30,6 +30,7 @@ def __init__( accelerator, layer, spatial_mapping, + spatial_mapping_int, temporal_mapping, 
access_same_data_considered_as_no_access=True, **kwargs @@ -39,12 +40,14 @@ def __init__( self.accelerator, self.layer, self.spatial_mapping, + self.spatial_mapping_int, self.temporal_mapping, self.access_same_data_considered_as_no_access, ) = ( accelerator, layer, spatial_mapping, + spatial_mapping_int, temporal_mapping, access_same_data_considered_as_no_access, ) @@ -55,6 +58,7 @@ def run(self) -> Generator[Tuple[CostModelEvaluation, Any], None, None]: accelerator=self.accelerator, layer=self.layer, spatial_mapping=self.spatial_mapping, + spatial_mapping_int=self.spatial_mapping_int, temporal_mapping=self.temporal_mapping, # the below parameter is optional access_same_data_considered_as_no_access=self.access_same_data_considered_as_no_access, diff --git a/zigzag/classes/stages/SpatialMappingConversionStage.py b/zigzag/classes/stages/SpatialMappingConversionStage.py index 44cbf89f..6f89c9fc 100644 --- a/zigzag/classes/stages/SpatialMappingConversionStage.py +++ b/zigzag/classes/stages/SpatialMappingConversionStage.py @@ -52,7 +52,7 @@ def is_nested_tuple(obj): def run(self): user_spatial_mapping = self.layer.user_spatial_mapping - spatial_mapping = self.convert_user_spatial_mapping(user_spatial_mapping) + spatial_mapping, spatial_mapping_int = self.convert_user_spatial_mapping(user_spatial_mapping) # Since the spatial_mapping may be modified in the previous step, # we have to update this change to self.layer updated_user_spatial_mapping = {} @@ -75,6 +75,7 @@ def run(self): kwargs = self.kwargs.copy() kwargs["spatial_mapping"] = spatial_mapping + kwargs["spatial_mapping_int"] = spatial_mapping_int kwargs["accelerator"] = self.accelerator kwargs["layer"] = self.layer @@ -159,15 +160,75 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): f"User-provided spatial mapping converted to: {user_spatial_mapping_for_log}" ) + spatial_mapping_dict = self.generate_spatial_mapping_dict( + user_spatial_mapping=limited_user_spatial_mapping, + layer=self.layer, + 
accelerator=self.accelerator + ) + # The next spatial_mapping_dict is used in cost model to calculate the interval between different data transfer. + # Different with the one above, there must only be integer numbers (corresponding to the real cases) + spatial_mapping_dict_int = self.generate_spatial_mapping_dict( + user_spatial_mapping=user_spatial_mapping, + layer=self.layer, + accelerator=self.accelerator + ) + + return SpatialMapping( + spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer + ), SpatialMapping( + spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer + ) + + def generate_limited_user_spatial_mapping( + self, layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop, user_spatial_mapping + ): + ## Do check on spatial mapping, and convert the mapping to a tuple + (loop_dim_unrolled, loop_size_unrolled) = spatial_loop + # Check 0: Skip this spatial dimension if it doesn't exist in the layer + if loop_dim_unrolled not in layer_dim_sizes.keys(): + return None + # Check 1: Limit unrolling if operational array dimension is smaller than provided unrolling + oa_dim_size = next( + (oa_dim for oa_dim in oa_dims if oa_dim.name == oa_dim_name) + ).size + loop_size_unrolled = min(oa_dim_size, loop_size_unrolled) + # Check 2: Limit unrolling if layer dimension is smaller than provided unrolling or if the loop dim doesn't exist + layer_dim_size = layer_dim_sizes.get(loop_dim_unrolled, 1) + loop_size_unrolled = min(layer_dim_size, loop_size_unrolled) + # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size + # and if there is no more mapping for this layer dimension + no_more_mapping_for_current_layer_dim = self.check_if_there_is_further_oa_mapping_for_current_layer_dim( + oa_dim_name=oa_dim_name, + loop_dim_unrolled=loop_dim_unrolled, + user_spatial_mapping=user_spatial_mapping + ) + if no_more_mapping_for_current_layer_dim: + loop_size_unrolled_on_early_oa_dims = self.calc_unrolled_loop_size_on_early_oa_dims( + 
oa_dim_name=oa_dim_name, + loop_dim_unrolled=loop_dim_unrolled, + user_spatial_mapping=user_spatial_mapping + ) + temporal_remainder = int(np.ceil(layer_dim_size / (loop_size_unrolled*loop_size_unrolled_on_early_oa_dims))) + loop_size_unrolled = layer_dim_size / temporal_remainder / loop_size_unrolled_on_early_oa_dims + return ( + loop_dim_unrolled, + loop_size_unrolled, + ) + + def generate_spatial_mapping_dict( + self, user_spatial_mapping, layer, accelerator + ): + # This function is to convert spatial mapping to spatial_mapping_dict, + # which attaches spatial mapping to different memory levels. spatial_mapping_dict = {} - layer_to_mem_op = self.layer.memory_operand_links + layer_to_mem_op = layer.memory_operand_links mem_to_layer_op = { mem_op: layer_op for (layer_op, mem_op) in layer_to_mem_op.items() } - core_id = self.layer.core_allocation - mem_hierarchy = self.accelerator.get_core(core_id).memory_hierarchy + core_id = layer.core_allocation + mem_hierarchy = accelerator.get_core(core_id).memory_hierarchy for mem_op, layer_op in mem_to_layer_op.items(): - user_sm_copy = limited_user_spatial_mapping.copy() + user_sm_copy = user_spatial_mapping.copy() # layer_op = mem_to_layer_op[mem_op] spatial_mapping_dict[layer_op] = [] memory_levels = mem_hierarchy.get_memory_levels( @@ -192,8 +253,8 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): spatial_mapping_size, ) = sub_spatial_loop if ( - spatial_mapping_dim - in spatial_mapping_lvl_dict.keys() + spatial_mapping_dim + in spatial_mapping_lvl_dict.keys() ): spatial_mapping_lvl_dict[ spatial_mapping_dim @@ -216,8 +277,8 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): # as the spatial mapping representation is a level-by-level one. 
del user_sm_copy[dim_name] for ( - spatial_mapping_lvl_dict_dim, - spatial_mapping_lvl_dict_size, + spatial_mapping_lvl_dict_dim, + spatial_mapping_lvl_dict_size, ) in spatial_mapping_lvl_dict.items(): spatial_mapping_lvl.append( (spatial_mapping_lvl_dict_dim, spatial_mapping_lvl_dict_size) @@ -231,49 +292,10 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): spatial_loop for (dim_name, spatial_loop) in user_sm_copy.items() ] spatial_mapping_dict[layer_op].append(top_level_spatial_mapping) - - return SpatialMapping( - spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer - ) - - def generate_limited_user_spatial_mapping( - self, layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop, user_spatial_mapping - ): - ## Do check on spatial mapping, and convert the mapping to a tuple - (loop_dim_unrolled, loop_size_unrolled) = spatial_loop - # Check 0: Skip this spatial dimension if it doesn't exist in the layer - if loop_dim_unrolled not in layer_dim_sizes.keys(): - return None - # Check 1: Limit unrolling if operational array dimension is smaller than provided unrolling - oa_dim_size = next( - (oa_dim for oa_dim in oa_dims if oa_dim.name == oa_dim_name) - ).size - loop_size_unrolled = min(oa_dim_size, loop_size_unrolled) - # Check 2: Limit unrolling if layer dimension is smaller than provided unrolling or if the loop dim doesn't exist - layer_dim_size = layer_dim_sizes.get(loop_dim_unrolled, 1) - loop_size_unrolled = min(layer_dim_size, loop_size_unrolled) - # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size - # and if there is no more mapping for this layer dimension - no_more_mapping_for_current_layer_dim = self.check_if_there_is_further_oa_mapping_for_current_layer_dim( - oa_dim_name=oa_dim_name, - loop_dim_unrolled=loop_dim_unrolled, - user_spatial_mapping=user_spatial_mapping - ) - if no_more_mapping_for_current_layer_dim: - loop_size_unrolled_on_early_oa_dims = self.calc_unrolled_loop_size_on_early_oa_dims( - 
oa_dim_name=oa_dim_name, - loop_dim_unrolled=loop_dim_unrolled, - user_spatial_mapping=user_spatial_mapping - ) - temporal_remainder = int(np.ceil(layer_dim_size / (loop_size_unrolled*loop_size_unrolled_on_early_oa_dims))) - loop_size_unrolled = layer_dim_size / temporal_remainder / loop_size_unrolled_on_early_oa_dims - return ( - loop_dim_unrolled, - loop_size_unrolled, - ) + return spatial_mapping_dict def check_if_there_is_further_oa_mapping_for_current_layer_dim( - self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping + self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping ): # For the case when there is layer dimension that is mapped on multiple oa dimensions. # We need to decide on which oa dimension to adjust the unrolling @@ -302,7 +324,7 @@ def check_if_there_is_further_oa_mapping_for_current_layer_dim( return no_more_mapping_for_current_layer_dim def calc_unrolled_loop_size_on_early_oa_dims( - self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping + self, oa_dim_name, loop_dim_unrolled, user_spatial_mapping ): # calculate the unrolled loop size for the specific layer dim on oa dims earlier than current oa dim loop_unrolled_size_already = 1 diff --git a/zigzag/classes/stages/SpatialMappingGeneratorStage.py b/zigzag/classes/stages/SpatialMappingGeneratorStage.py index 85b81719..8f7bf9fc 100644 --- a/zigzag/classes/stages/SpatialMappingGeneratorStage.py +++ b/zigzag/classes/stages/SpatialMappingGeneratorStage.py @@ -2,11 +2,14 @@ from zigzag.classes.opt.spatial.generator import UserSpatialMappingGenerator from zigzag.classes.hardware.architecture.core import Core +from zigzag.classes.hardware.architecture.accelerator import Accelerator +from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy from zigzag.classes.stages.Stage import Stage from zigzag.classes.stages.SpatialMappingConversionStage import ( SpatialMappingConversionStage, ) import copy +from zigzag.utils import pickle_deepcopy logger = 
logging.getLogger(__name__) @@ -33,6 +36,7 @@ def __init__( layer, enable_mix_spatial_mapping_generation=False, maximize_hardware_utilization=True, + enable_weight_diagonal_mapping=False, **kwargs, ): super().__init__(list_of_callables, **kwargs) @@ -43,6 +47,7 @@ def __init__( enable_mix_spatial_mapping_generation ) self.maximize_hardware_utilization = maximize_hardware_utilization + self.enable_weight_diagonal_mapping = enable_weight_diagonal_mapping @staticmethod # Check that the layer includes: @@ -74,9 +79,11 @@ def run(self): user_provided_spatial_mappings, dict ): # There is a single USM provided if len(user_provided_spatial_mappings) < len(oa_dims): - self.layer.user_spatial_mapping_hint = self.complete_user_spatial_mapping_hint( - user_spatial_mapping_hint=user_spatial_mapping_hint, - oa_dims=oa_dims + self.layer.user_spatial_mapping_hint = ( + self.complete_user_spatial_mapping_hint( + user_spatial_mapping_hint=user_spatial_mapping_hint, + oa_dims=oa_dims, + ) ) user_spatial_mapping_generator = UserSpatialMappingGenerator( layer=self.layer, @@ -96,9 +103,10 @@ def run(self): ): # There are multiple USMs provided user_spatial_mappings = user_provided_spatial_mappings else: # There is no USM provided - self.layer.user_spatial_mapping_hint = self.complete_user_spatial_mapping_hint( - user_spatial_mapping_hint=user_spatial_mapping_hint, - oa_dims=oa_dims + self.layer.user_spatial_mapping_hint = ( + self.complete_user_spatial_mapping_hint( + user_spatial_mapping_hint=user_spatial_mapping_hint, oa_dims=oa_dims + ) ) # Initialize the UserSpatialMappingGenerator which will automatically generate SMs user_spatial_mapping_generator = UserSpatialMappingGenerator( @@ -106,6 +114,7 @@ def run(self): accelerator=self.accelerator, enable_mix_spatial_mapping_generation=self.enable_mix_spatial_mapping_generation, maximize_hardware_utilization=self.maximize_hardware_utilization, + enable_weight_diagonal_mapping=self.enable_weight_diagonal_mapping, ) # Get all the USMs by 
running the generator user_spatial_mappings = list( @@ -122,18 +131,35 @@ def run(self): self.layer.user_spatial_mapping = user_spatial_mapping # Note: manual instantiation of spatial mapping conversion stage here. We let that class deal with # everything else, including instantion of the actual substages - spatial_mapping_conversion_stage = SpatialMappingConversionStage( - self.list_of_callables, - accelerator=self.accelerator, - layer=copy.copy(self.layer), - **self.kwargs, - ) + + # Modify the size of lower input mem to support weight diagonal spatial unrolling (for OX/OY) + if self.enable_weight_diagonal_mapping: + ( + input_mem_size_updated, + new_accelerator, + ) = self.modify_innermost_input_mem_size(core_id, user_spatial_mapping) + if self.enable_weight_diagonal_mapping and input_mem_size_updated: + original_accelerator = self.accelerator + spatial_mapping_conversion_stage = SpatialMappingConversionStage( + self.list_of_callables, + accelerator=new_accelerator, + layer=copy.copy(self.layer), + **self.kwargs, + ) + else: + spatial_mapping_conversion_stage = SpatialMappingConversionStage( + self.list_of_callables, + accelerator=self.accelerator, + layer=copy.copy(self.layer), + **self.kwargs, + ) for cme, extra_info in spatial_mapping_conversion_stage.run(): + if self.enable_weight_diagonal_mapping and input_mem_size_updated: + # recover back the accelerator if its mem size is adjusted before + cme.accelerator = original_accelerator yield cme, (user_spatial_mapping, extra_info) - def complete_user_spatial_mapping_hint( - self, user_spatial_mapping_hint, oa_dims - ): + def complete_user_spatial_mapping_hint(self, user_spatial_mapping_hint, oa_dims): # This function is to create user_spatial_mapping_hint when it is not provided # or complete it if it is provided but on only part of oa dimensions. 
complete_user_spatial_mapping_hint = user_spatial_mapping_hint @@ -156,4 +182,126 @@ def complete_user_spatial_mapping_hint( layer_dim for layer_dim in self.layer.loop_dim_list ] # self.layer.user_spatial_mapping_hint = user_spatial_mapping_hint - return complete_user_spatial_mapping_hint \ No newline at end of file + return complete_user_spatial_mapping_hint + + def modify_innermost_input_mem_size(self, core_id, user_spatial_mapping): + # To support OX, OY unrolling, we will scale the lowest input mem size by OXu*OYu + # to avoid the MemoryTooSmallException in loma stage. + input_mem_size_updated = ( + False # flag to indicate if the accelerator is modified. + ) + core = self.accelerator.get_core(core_id=core_id) + operational_array = core.operational_array + oa_dims = operational_array.dimensions + memory_hierarchy = copy.deepcopy(core.memory_hierarchy) + innermost_levels = memory_hierarchy.get_inner_memories() + # get the link from layer op to mem op + layer_op_to_mem_op: dict = self.layer.memory_operand_links + # check if it is weight stationary. + # keep the spatial loop as it was if it is not weight stationary. 
+ if len(self.layer.constant_operands) > 1: + return input_mem_size_updated, self.accelerator + # get weight operand name + const_operand = self.layer.constant_operands[0] # weight representation + # get activation operand name + act_operand = [ + operand for operand in self.layer.input_operands if operand != const_operand + ][0] + # get name of OX, OY (weight ir layer dims) + weight_ir_layer_dims: list = self.layer.operand_loop_dim[const_operand]["ir"] + # get the oa_dim name served by input innermost memory level + for memory_level in innermost_levels: + mem_ops = memory_level.operands + if layer_op_to_mem_op[act_operand] in mem_ops: + act_innermost_mem_level = memory_level + act_served_oa_dim: set = memory_level.served_dimensions + act_served_oa_dim_name = list(act_served_oa_dim)[0].name + # check if act is not served in the innermost memories, or it is uti-casting for act. + # keep the spatial loop as it was if act is not served. + if "act_served_oa_dim" not in locals() or len(act_served_oa_dim) == 0: + return input_mem_size_updated, self.accelerator + # get the mem scaling factor if OX, OY exist + mem_scaling_factor = 1 + if ( + act_served_oa_dim_name not in user_spatial_mapping.keys() + ): # there is no sm loop + pass + else: # there is sm loop on act served oa dim + act_served_oa_mapping = user_spatial_mapping[act_served_oa_dim_name] + if self.is_nested_tuple( + act_served_oa_mapping + ): # a mix sm mapping, e.g. 
(("K", 2), ("OX", 5)) + for element in act_served_oa_mapping: + layer_dim = element[0] + if layer_dim in weight_ir_layer_dims: + layer_size = element[1] + mem_scaling_factor *= layer_size + else: # a single layer dim mapping + layer_dim = act_served_oa_mapping[0] + if layer_dim in weight_ir_layer_dims: + layer_size = act_served_oa_mapping[1] + mem_scaling_factor *= layer_size + # scale the mem size + if mem_scaling_factor == 1: + # No need to change the input mem size + return input_mem_size_updated, self.accelerator + else: + input_mem_size_updated = True + # Initialize the new memory hierarchy + mh_name = memory_hierarchy.name + new_mh_name = mh_name + "-supporting-diagonal-map" + new_memory_hierarchy = MemoryHierarchy(operational_array, new_mh_name) + # Add memories to the new memory hierarchy with the correct attributes + for curr_mem_level, memory_level in enumerate( + memory_hierarchy.mem_level_list + ): + memory_instance = memory_level.memory_instance + if memory_level == act_innermost_mem_level: + memory_instance.size *= mem_scaling_factor # scale here. For others, keep them unchanged. 
+ operands = tuple(memory_level.operands) + port_alloc = memory_level.port_alloc_raw + served_dimensions_vec = memory_level.served_dimensions_vec + assert len(served_dimensions_vec) >= 1 + served_dimensions = served_dimensions_vec[0] + + new_memory_instance = pickle_deepcopy(memory_instance) + new_operands = pickle_deepcopy(operands) + new_port_alloc = pickle_deepcopy(port_alloc) + new_served_dimensions = pickle_deepcopy(served_dimensions) + new_memory_hierarchy.add_memory( + memory_instance=new_memory_instance, + operands=new_operands, + port_alloc=new_port_alloc, + served_dimensions=new_served_dimensions, + ) + # Create the new core + id = core.id + dataflows = core.dataflows + new_id = id + new_dataflows = pickle_deepcopy(dataflows) + + new_core = Core( + id=new_id, + operational_array=operational_array, + memory_hierarchy=new_memory_hierarchy, + dataflows=new_dataflows, + ) + + # Create the new accelerator + name = self.accelerator.name + new_name = name + "-supporting-diagonal-map" + new_cores = {new_core} + new_accelerator = Accelerator( + name=new_name, + core_set=new_cores, + ) + return input_mem_size_updated, new_accelerator + + @staticmethod + def is_nested_tuple(obj): + if isinstance(obj, tuple): + for item in obj: + if isinstance(item, tuple): + # If any item within the tuple is itself a tuple, it's a nested tuple + return True + return False From 1f24d5636d934d50ba0f2f160d1a0b827f2e3f74 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Fri, 10 Nov 2023 20:20:06 +0100 Subject: [PATCH 07/14] delete debug.py --- debug.py | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 debug.py diff --git a/debug.py b/debug.py deleted file mode 100644 index 918bf98a..00000000 --- a/debug.py +++ /dev/null @@ -1,26 +0,0 @@ -from zigzag.api import get_hardware_performance_zigzag - -opt = 'EDP' -model = "alexnet" -onnx_model_path = f"zigzag/inputs/examples/workload/{model}.onnx" -workload = onnx_model_path - -hwarchs = ["Edge_TPU_like", 
"Ascend_like", "Eyeriss_like", "Meta_prototype", "Tesla_NPU_like", "TPU_like"] - -for hwarch in hwarchs: - - mapping = f"zigzag.inputs.examples.mapping.default" - accelerator = f"zigzag.inputs.examples.hardware.{hwarch}" - - dump_filename_pattern=f"outputs/{hwarch}-{model}-layer_?.json" - pickle_filename = f"outputs/{hwarch}-{model}-saved_list_of_cmes.pickle" - - energy, latency, cme = get_hardware_performance_zigzag(workload=workload, - accelerator=accelerator, - mapping=mapping, - opt=opt, - dump_filename_pattern=dump_filename_pattern, - pickle_filename=pickle_filename) - print(f"Total network energy = {energy:.2e} pJ") - print(f"Total network latency = {latency:.2e} cycles") - print(f"Total edp = {energy*latency:.2e} pJ*cycles") \ No newline at end of file From 9b481bba991c98224996cd5f94e121570344220f Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Sat, 11 Nov 2023 19:55:03 +0100 Subject: [PATCH 08/14] Update the check condition when applying weight diagonal mapping. Now the check condition is more strict. --- zigzag/classes/opt/spatial/generator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index c3e35582..6d4f4a84 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -480,9 +480,11 @@ def add_input_pr_spatial_loop_if_enabled( act_served_oa_dim: set = memory_level.served_dimensions if layer_op_to_mem_op[output_operand] in mem_ops: output_served_oa_dim: set = memory_level.served_dimensions - # check if act is not served in the innermost memories, or it is uti-casting for act. + # check if act is not served in the innermost memories, or act/output is not multicasting on only one dimension. # keep the spatial loop as it was if act is not served. 
- if "act_served_oa_dim" not in locals() or len(act_served_oa_dim) == 0: + if "act_served_oa_dim" not in locals() or len(act_served_oa_dim) != 1: + return user_spatial_mapping + if "output_served_oa_dim" not in locals() or len(output_served_oa_dim) != 1: return user_spatial_mapping act_served_oa_dim_name = list(act_served_oa_dim)[0].name From 18c39d2838458fd55de2a946a1abec245ea931c8 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Sun, 12 Nov 2023 19:18:36 +0100 Subject: [PATCH 09/14] Fix the issue that sometimes there is no sm loop yielded when a layer dim is mapped on multiple hardware dims --- zigzag/classes/opt/spatial/generator.py | 97 ++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 2 deletions(-) diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index 6d4f4a84..f5d338c8 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -182,8 +182,9 @@ def generate_user_spatial_mappings( # Now we have to combine them into user-defined spatial mappings. # record down the number of yield yield_count = 0 + yield_count_limit = 2 # used to control the yield count when maximize_hardware_utilization == True for combination in itertools.product(*unrollings): - if maximize_hardware_utilization and yield_count >= 2: + if maximize_hardware_utilization and yield_count >= yield_count_limit: # 2 means: only check the top 2 spatial mapping with the highest hardware utilization # Modify "2" to other numbers if you want to check on more spatial mappings. break @@ -214,12 +215,104 @@ def generate_user_spatial_mappings( yield user_spatial_mapping yield_count += 1 # If yield_count==0, it means there is no legal spatial mapping found. - # The reason is that the spatial mapping provided by the user has exceeded the layer dim size, + # One reason is that the spatial mapping provided by the user has exceeded the layer dim size, # therefore the loop cannot pass the check. 
+ # The other reason could be: there is a layer dim mapped on multiple oa dims, + # so the product has exceeded the layer dim size. + # For a quick fix on the second cause, we will reform the sm loop only for single layer dim mapping. + if yield_count == 0: + for combination in itertools.product(*unrollings): + is_mix_comb = False + for loop in combination: + if self.is_nested_tuple(loop): + is_mix_comb = True + continue + if is_mix_comb: + # The fix is not applied for mix sm loop. + continue + if maximize_hardware_utilization and yield_count >= yield_count_limit: + # 2 means: only check the top 2 spatial mapping with the highest hardware utilization + # Modify "2" to other numbers if you want to check on more spatial mappings. + break + new_combination, left_layer_dim_size = self.shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims( + combination=combination, + layer=self.layer, + ) + # Zip the combination (which is a (layer_dim, layer_size) for each oa_dim with the oa_dim names. + oa_dim_names = [oa_dim.name for oa_dim in oa_dims] + + user_spatial_mapping = { + oa_dim_name: unrolling + for (oa_dim_name, unrolling) in zip(oa_dim_names, new_combination) + if unrolling is not None + } + # Add act ir loop if it is weight stationary and the innermost memories serve for act. + if enable_weight_diagonal_mapping: + user_spatial_mapping = self.add_input_pr_spatial_loop_if_enabled( + layer=self.layer, + provided_user_spatial_mapping=user_spatial_mapping, + user_spatial_mapping_hint=user_spatial_mapping_hint, + innermost_levels=innermost_levels, + left_layer_dim_size=left_layer_dim_size, + enable_mix_spatial_mapping_generation=enable_mix_spatial_mapping_generation, + ) + yield user_spatial_mapping + yield_count += 1 + assert ( yield_count > 0 ), "There is no legal spatial mapping found. Please make sure the provided spatial mappings do not exceed the layer dimension size." 
+ def shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims( + self, combination, layer + ): + new_combination = combination + legal_spatial_loop, left_layer_dim_size = self.check_spatial_loop_legality( + combination=new_combination, layer=layer + ) + while not legal_spatial_loop: + new_combination_next = list(new_combination) + for layer_dim, layer_dim_size in left_layer_dim_size.items(): + if layer_dim_size < 1: + scaled_success = False + for oa_index in range(len(new_combination_next)-1, -1, -1): # reverse order on oa dims + (mapped_layer_dim, mapped_layer_dim_size) = new_combination_next[oa_index] + if mapped_layer_dim_size > 1: + # shrink the mapped layer dim size + mapped_layer_dim_size -= 1 + new_combination_next[oa_index] = (mapped_layer_dim, mapped_layer_dim_size) + scaled_success = True + break + else: + # because a layer can be mapped on multiple oa dims, we will move to the next oa dim. + pass + # assert: if not scaled_success, + # it means the sm loop cannot pass the check, even though all mapped size on this layer dim is 1 + assert scaled_success, \ + f"The spatial loop cannot meet the current hardware dimension after scaling, " \ + f"Current spatial loop: {new_combination}" + new_combination_next = tuple(new_combination_next) + # Next we will judge if new_combination_next is a legal loop + # If it is, then we will keep the current combination, rather than new_combination_next, + # the reason is: new_combination can cover the entire layer dim, but new_combination_next is smaller than + # the layer dim, therefore the actual sm loop for the layer dim is a decimal number. + # In that case, we will ceil it up to mimic the real case on hardware. 
+ legal_spatial_loop, left_layer_dim_size_next = self.check_spatial_loop_legality( + combination=new_combination_next, layer=layer + ) + if not legal_spatial_loop: + new_combination = new_combination_next + left_layer_dim_size = left_layer_dim_size_next + else: + for layer_dim, layer_dim_size in left_layer_dim_size.items(): + # A special case when we will use new_combination_next when legal_spatial_loop == True + # This case is when new_combination_next exactly match the layer dim size (left size == 1) + if layer_dim_size < 1 and left_layer_dim_size_next[layer_dim] == 1: + new_combination = new_combination_next + left_layer_dim_size = left_layer_dim_size_next + break + return new_combination, left_layer_dim_size + def check_spatial_loop_legality(self, combination, layer): # Extra check on the total unrolling size of a layer dim, if it is mapped on >=2 dimensions. combination_check = { From 20f541c719564da69476dc5efd0fb8d6042fcd33 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Mon, 13 Nov 2023 14:31:12 +0100 Subject: [PATCH 10/14] fix typo in cost models for pure pe and imc --- tests/main/test_origin/test_ascend_like.py | 2 +- .../test_origin/test_meta_prototype_like.py | 2 +- tests/main/test_origin/test_tesla_npu_like.py | 8 +- tests/main/test_origin/test_tpu_like.py | 4 +- .../test_ascend_like.py | 2 +- .../test_ascend_like.py | 2 +- .../test_tesla_npu_like.py | 6 +- .../test_tpu_like.py | 4 +- zigzag/classes/cost_model/cost_model.py | 2 +- zigzag/classes/opt/spatial/generator.py | 29 +++- .../stages/SpatialMappingConversionStage.py | 142 +++++++++++++----- 11 files changed, 144 insertions(+), 59 deletions(-) diff --git a/tests/main/test_origin/test_ascend_like.py b/tests/main/test_origin/test_ascend_like.py index 248c6d0c..f92e9788 100644 --- a/tests/main/test_origin/test_ascend_like.py +++ b/tests/main/test_origin/test_ascend_like.py @@ -12,7 +12,7 @@ # Expected energy and latency for each workload defined above ens_lats = { 
"zigzag/inputs/examples/workload/alexnet.onnx": (5738192980.375, 8728331), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1913797698.5250015, 7426499), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1913797698.5250015, 7439255), "zigzag/inputs/examples/workload/resnet18.onnx": (1858697886.165, 3720129), "zigzag.inputs.examples.workload.resnet18": (2408671233.7250004, 4804196), } diff --git a/tests/main/test_origin/test_meta_prototype_like.py b/tests/main/test_origin/test_meta_prototype_like.py index e4299fba..4d8f397f 100644 --- a/tests/main/test_origin/test_meta_prototype_like.py +++ b/tests/main/test_origin/test_meta_prototype_like.py @@ -12,7 +12,7 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (5771558839.89, 8400651), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1731935837.864999, 3594631), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1731935837.864999, 3606391), "zigzag/inputs/examples/workload/resnet18.onnx": (1869519792.3449998, 3408373), "zigzag.inputs.examples.workload.resnet18": (2419893343.4549994, 4176163), } diff --git a/tests/main/test_origin/test_tesla_npu_like.py b/tests/main/test_origin/test_tesla_npu_like.py index 11a53097..f8a98a2c 100644 --- a/tests/main/test_origin/test_tesla_npu_like.py +++ b/tests/main/test_origin/test_tesla_npu_like.py @@ -11,10 +11,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (6131950030.816001, 8486444), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1671933042.2130003, 2909436), - "zigzag/inputs/examples/workload/resnet18.onnx": (1863717063.505, 3395752), - "zigzag.inputs.examples.workload.resnet18": (2375316568.8910007, 4082454), + "zigzag/inputs/examples/workload/alexnet.onnx": (6131950030.816001, 8496179), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1671933042.2130003, 2964784), + 
"zigzag/inputs/examples/workload/resnet18.onnx": (1863717063.505, 3410738), + "zigzag.inputs.examples.workload.resnet18": (2375316568.8910007, 4096544), } diff --git a/tests/main/test_origin/test_tpu_like.py b/tests/main/test_origin/test_tpu_like.py index a2ca227f..d59700e6 100644 --- a/tests/main/test_origin/test_tpu_like.py +++ b/tests/main/test_origin/test_tpu_like.py @@ -11,8 +11,8 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5567502618.941999, 9078209), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1904494517.552001, 23112606), + "zigzag/inputs/examples/workload/alexnet.onnx": (5567502618.941999, 9080913), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1904494517.552001, 23131716), "zigzag/inputs/examples/workload/resnet18.onnx": (1795904779.6570003, 4160591), "zigzag.inputs.examples.workload.resnet18": (2296491401.491, 4909027), } diff --git a/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py b/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py index 182a872f..fcc09fa1 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py @@ -14,7 +14,7 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (5667407342.66, 8528846), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (921552096.0700004, 3828967), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (921552096.0700004, 3835435), "zigzag/inputs/examples/workload/resnet18.onnx": (1679218425.5100002, 3713386), "zigzag.inputs.examples.workload.resnet18": (2290766279.31, 4442443), } diff --git a/tests/main/test_without_unused_memory/test_ascend_like.py b/tests/main/test_without_unused_memory/test_ascend_like.py index 4eee129a..b6fc7a72 100644 --- a/tests/main/test_without_unused_memory/test_ascend_like.py +++ 
b/tests/main/test_without_unused_memory/test_ascend_like.py @@ -12,7 +12,7 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (5649555894.9, 8637780), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6486685), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6499441), "zigzag/inputs/examples/workload/resnet18.onnx": (1709089377.83, 3583047), "zigzag.inputs.examples.workload.resnet18": (2243493483.15, 4657130), } diff --git a/tests/main/test_without_unused_memory/test_tesla_npu_like.py b/tests/main/test_without_unused_memory/test_tesla_npu_like.py index 25eb9648..3ccaafb2 100644 --- a/tests/main/test_without_unused_memory/test_tesla_npu_like.py +++ b/tests/main/test_without_unused_memory/test_tesla_npu_like.py @@ -12,9 +12,9 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (6040086796.366001, 8389669), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060.6110002, 1965457), - "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3257898), - "zigzag.inputs.examples.workload.resnet18": (2220861655.6660004, 3934616), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060.6110002, 1969009), + "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3267252), + "zigzag.inputs.examples.workload.resnet18": (2220861655.6660004, 3943074), } diff --git a/tests/main/test_without_unused_memory/test_tpu_like.py b/tests/main/test_without_unused_memory/test_tpu_like.py index 28df3fa1..ae1fe912 100644 --- a/tests/main/test_without_unused_memory/test_tpu_like.py +++ b/tests/main/test_without_unused_memory/test_tpu_like.py @@ -11,8 +11,8 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5475639384.492001, 8979956), - 
"zigzag/inputs/examples/workload/mobilenetv2.onnx": (952688145.0069999, 21873214), + "zigzag/inputs/examples/workload/alexnet.onnx": (5475639384.492001, 8981556), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (952688145.0069999, 21873319), "zigzag/inputs/examples/workload/resnet18.onnx": (1659252422.016, 4000289), "zigzag.inputs.examples.workload.resnet18": (1982830786.5119998, 4509235), } diff --git a/zigzag/classes/cost_model/cost_model.py b/zigzag/classes/cost_model/cost_model.py index d4296170..5369dfd1 100644 --- a/zigzag/classes/cost_model/cost_model.py +++ b/zigzag/classes/cost_model/cost_model.py @@ -251,7 +251,7 @@ def __init__( # self.spatial_mapping_dict_int = spatial_mapping_fractional_to_int( # self.spatial_mapping.mapping_dict_origin # ) - self.spatial_mapping_dict_int = self.spatial_mapping_int + self.spatial_mapping_dict_int = self.spatial_mapping_int.mapping_dict_origin # For constructing Mapping object, the last parameter "self.access_same_data_considered_as_no_access" is optional self.mapping = Mapping( diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index f5d338c8..cf7d74c6 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -234,7 +234,10 @@ def generate_user_spatial_mappings( # 2 means: only check the top 2 spatial mapping with the highest hardware utilization # Modify "2" to other numbers if you want to check on more spatial mappings. 
break - new_combination, left_layer_dim_size = self.shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims( + ( + new_combination, + left_layer_dim_size, + ) = self.shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims( combination=combination, layer=self.layer, ) @@ -275,12 +278,20 @@ def shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims( for layer_dim, layer_dim_size in left_layer_dim_size.items(): if layer_dim_size < 1: scaled_success = False - for oa_index in range(len(new_combination_next)-1, -1, -1): # reverse order on oa dims - (mapped_layer_dim, mapped_layer_dim_size) = new_combination_next[oa_index] + for oa_index in range( + len(new_combination_next) - 1, -1, -1 + ): # reverse order on oa dims + ( + mapped_layer_dim, + mapped_layer_dim_size, + ) = new_combination_next[oa_index] if mapped_layer_dim_size > 1: # shrink the mapped layer dim size mapped_layer_dim_size -= 1 - new_combination_next[oa_index] = (mapped_layer_dim, mapped_layer_dim_size) + new_combination_next[oa_index] = ( + mapped_layer_dim, + mapped_layer_dim_size, + ) scaled_success = True break else: @@ -288,16 +299,20 @@ def shrink_combination_when_a_layer_dim_is_mapped_on_multiple_oa_dims( pass # assert: if not scaled_success, # it means the sm loop cannot pass the check, even though all mapped size on this layer dim is 1 - assert scaled_success, \ - f"The spatial loop cannot meet the current hardware dimension after scaling, " \ + assert scaled_success, ( + f"The spatial loop cannot meet the current hardware dimension after scaling, " f"Current spatial loop: {new_combination}" + ) new_combination_next = tuple(new_combination_next) # Next we will judge if new_combination_next is a legal loop # If it is, then we will keep the current combination, rather than new_combination_next, # the reason is: new_combination can cover the entire layer dim, but new_combination_next is smaller than # the layer dim, therefore the actual sm loop for the layer dim is a 
decimal number. # In that case, we will ceil it up to mimic the real case on hardware. - legal_spatial_loop, left_layer_dim_size_next = self.check_spatial_loop_legality( + ( + legal_spatial_loop, + left_layer_dim_size_next, + ) = self.check_spatial_loop_legality( combination=new_combination_next, layer=layer ) if not legal_spatial_loop: diff --git a/zigzag/classes/stages/SpatialMappingConversionStage.py b/zigzag/classes/stages/SpatialMappingConversionStage.py index 6f89c9fc..f995df88 100644 --- a/zigzag/classes/stages/SpatialMappingConversionStage.py +++ b/zigzag/classes/stages/SpatialMappingConversionStage.py @@ -52,7 +52,9 @@ def is_nested_tuple(obj): def run(self): user_spatial_mapping = self.layer.user_spatial_mapping - spatial_mapping, spatial_mapping_int = self.convert_user_spatial_mapping(user_spatial_mapping) + spatial_mapping, spatial_mapping_int = self.convert_user_spatial_mapping( + user_spatial_mapping + ) # Since the spatial_mapping may be modified in the previous step, # we have to update this change to self.layer updated_user_spatial_mapping = {} @@ -105,13 +107,29 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): oa_dims = core.operational_array.dimensions layer_dim_sizes = self.layer.loop_dim_size.copy() limited_user_spatial_mapping = {} # init dict we will be filling + limited_user_spatial_mapping_int = {} # init dict int we will be filling for oa_dim_name, spatial_loop in user_spatial_mapping.items(): if self.is_nested_tuple(spatial_loop): # mix sm loop limited_mix_user_spatial_mapping_on_dim = [] + limited_mix_user_spatial_mapping_int_on_dim = [] for spatial_loop_element in spatial_loop: limited_user_spatial_mapping_to_check = ( self.generate_limited_user_spatial_mapping( - layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop_element, user_spatial_mapping + layer_dim_sizes, + oa_dims, + oa_dim_name, + spatial_loop_element, + user_spatial_mapping, + ) + ) + limited_user_spatial_mapping_int_to_check = ( + 
self.generate_limited_user_spatial_mapping( + layer_dim_sizes, + oa_dims, + oa_dim_name, + spatial_loop_element, + user_spatial_mapping, + False, ) ) if limited_user_spatial_mapping_to_check == None: @@ -120,19 +138,42 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): limited_mix_user_spatial_mapping_on_dim.append( limited_user_spatial_mapping_to_check ) + limited_mix_user_spatial_mapping_int_on_dim.append( + limited_user_spatial_mapping_int_to_check + ) if len(limited_mix_user_spatial_mapping_on_dim) == 0: continue # Skip this spatial dimension if the defined dims in sm don't exist in the layer else: limited_mix_user_spatial_mapping_on_dim = tuple( limited_mix_user_spatial_mapping_on_dim ) + limited_mix_user_spatial_mapping_int_on_dim = tuple( + limited_mix_user_spatial_mapping_int_on_dim + ) limited_user_spatial_mapping[ oa_dim_name ] = limited_mix_user_spatial_mapping_on_dim + limited_user_spatial_mapping_int[ + oa_dim_name + ] = limited_mix_user_spatial_mapping_int_on_dim else: # single-dim sm loop limited_user_spatial_mapping_to_check = ( self.generate_limited_user_spatial_mapping( - layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop, user_spatial_mapping + layer_dim_sizes, + oa_dims, + oa_dim_name, + spatial_loop, + user_spatial_mapping, + ) + ) + limited_user_spatial_mapping_int_to_check = ( + self.generate_limited_user_spatial_mapping( + layer_dim_sizes, + oa_dims, + oa_dim_name, + spatial_loop, + user_spatial_mapping, + False, ) ) if limited_user_spatial_mapping_to_check == None: @@ -141,6 +182,9 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): limited_user_spatial_mapping[ oa_dim_name ] = limited_user_spatial_mapping_to_check + limited_user_spatial_mapping_int[ + oa_dim_name + ] = limited_user_spatial_mapping_int_to_check # Update the layer_dim_size to support multiple oa dims unrolling the same loop dim but not unrolling it more than the total layer dim # if ( # temporal_remainder == 1 @@ -163,14 +207,14 @@ def 
convert_user_spatial_mapping(self, user_spatial_mapping): spatial_mapping_dict = self.generate_spatial_mapping_dict( user_spatial_mapping=limited_user_spatial_mapping, layer=self.layer, - accelerator=self.accelerator + accelerator=self.accelerator, ) # The next spatial_mapping_dict is used in cost model to calculate the interval between different data transfer. - # Different with the one above, there must only be integer numbers (corresponding to the real cases) + # Different with the one above, there are only integer numbers (corresponding to the real cases) spatial_mapping_dict_int = self.generate_spatial_mapping_dict( - user_spatial_mapping=user_spatial_mapping, + user_spatial_mapping=limited_user_spatial_mapping_int, layer=self.layer, - accelerator=self.accelerator + accelerator=self.accelerator, ) return SpatialMapping( @@ -180,7 +224,13 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): ) def generate_limited_user_spatial_mapping( - self, layer_dim_sizes, oa_dims, oa_dim_name, spatial_loop, user_spatial_mapping + self, + layer_dim_sizes, + oa_dims, + oa_dim_name, + spatial_loop, + user_spatial_mapping, + check_3=True, ): ## Do check on spatial mapping, and convert the mapping to a tuple (loop_dim_unrolled, loop_size_unrolled) = spatial_loop @@ -195,29 +245,41 @@ def generate_limited_user_spatial_mapping( # Check 2: Limit unrolling if layer dimension is smaller than provided unrolling or if the loop dim doesn't exist layer_dim_size = layer_dim_sizes.get(loop_dim_unrolled, 1) loop_size_unrolled = min(layer_dim_size, loop_size_unrolled) - # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size - # and if there is no more mapping for this layer dimension - no_more_mapping_for_current_layer_dim = self.check_if_there_is_further_oa_mapping_for_current_layer_dim( - oa_dim_name=oa_dim_name, - loop_dim_unrolled=loop_dim_unrolled, - user_spatial_mapping=user_spatial_mapping - ) - if no_more_mapping_for_current_layer_dim: - 
loop_size_unrolled_on_early_oa_dims = self.calc_unrolled_loop_size_on_early_oa_dims( - oa_dim_name=oa_dim_name, - loop_dim_unrolled=loop_dim_unrolled, - user_spatial_mapping=user_spatial_mapping + if check_3: + # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size + # and if there is no more mapping for this layer dimension + no_more_mapping_for_current_layer_dim = ( + self.check_if_there_is_further_oa_mapping_for_current_layer_dim( + oa_dim_name=oa_dim_name, + loop_dim_unrolled=loop_dim_unrolled, + user_spatial_mapping=user_spatial_mapping, + ) ) - temporal_remainder = int(np.ceil(layer_dim_size / (loop_size_unrolled*loop_size_unrolled_on_early_oa_dims))) - loop_size_unrolled = layer_dim_size / temporal_remainder / loop_size_unrolled_on_early_oa_dims + if no_more_mapping_for_current_layer_dim: + loop_size_unrolled_on_early_oa_dims = ( + self.calc_unrolled_loop_size_on_early_oa_dims( + oa_dim_name=oa_dim_name, + loop_dim_unrolled=loop_dim_unrolled, + user_spatial_mapping=user_spatial_mapping, + ) + ) + temporal_remainder = int( + np.ceil( + layer_dim_size + / (loop_size_unrolled * loop_size_unrolled_on_early_oa_dims) + ) + ) + loop_size_unrolled = ( + layer_dim_size + / temporal_remainder + / loop_size_unrolled_on_early_oa_dims + ) return ( loop_dim_unrolled, loop_size_unrolled, ) - def generate_spatial_mapping_dict( - self, user_spatial_mapping, layer, accelerator - ): + def generate_spatial_mapping_dict(self, user_spatial_mapping, layer, accelerator): # This function is to convert spatial mapping to spatial_mapping_dict, # which attaches spatial mapping to different memory levels. 
spatial_mapping_dict = {} @@ -253,8 +315,8 @@ def generate_spatial_mapping_dict( spatial_mapping_size, ) = sub_spatial_loop if ( - spatial_mapping_dim - in spatial_mapping_lvl_dict.keys() + spatial_mapping_dim + in spatial_mapping_lvl_dict.keys() ): spatial_mapping_lvl_dict[ spatial_mapping_dim @@ -277,8 +339,8 @@ def generate_spatial_mapping_dict( # as the spatial mapping representation is a level-by-level one. del user_sm_copy[dim_name] for ( - spatial_mapping_lvl_dict_dim, - spatial_mapping_lvl_dict_size, + spatial_mapping_lvl_dict_dim, + spatial_mapping_lvl_dict_size, ) in spatial_mapping_lvl_dict.items(): spatial_mapping_lvl.append( (spatial_mapping_lvl_dict_dim, spatial_mapping_lvl_dict_size) @@ -319,7 +381,9 @@ def check_if_there_is_further_oa_mapping_for_current_layer_dim( loop_dim_unrolled_private = spatial_loop_private[0] if loop_dim_unrolled == loop_dim_unrolled_private: no_more_mapping_for_current_layer_dim = False - if not no_more_mapping_for_current_layer_dim: # early exit if the flag is already False + if ( + not no_more_mapping_for_current_layer_dim + ): # early exit if the flag is already False break return no_more_mapping_for_current_layer_dim @@ -332,12 +396,18 @@ def calc_unrolled_loop_size_on_early_oa_dims( if oa_dim_name == oa_dim_name_private: break if self.is_nested_tuple(spatial_loop_private): # mix sm loop - for spatial_loop_element in spatial_loop_private: - (loop_dim_unrolled_private, loop_size_unrolled_private) = spatial_loop_element - if loop_dim_unrolled == loop_dim_unrolled_private: - loop_unrolled_size_already *= loop_size_unrolled_private + for spatial_loop_element in spatial_loop_private: + ( + loop_dim_unrolled_private, + loop_size_unrolled_private, + ) = spatial_loop_element + if loop_dim_unrolled == loop_dim_unrolled_private: + loop_unrolled_size_already *= loop_size_unrolled_private else: - (loop_dim_unrolled_private, loop_size_unrolled_private) = spatial_loop_private + ( + loop_dim_unrolled_private, + 
loop_size_unrolled_private, + ) = spatial_loop_private if loop_dim_unrolled == loop_dim_unrolled_private: loop_unrolled_size_already *= loop_size_unrolled_private - return loop_unrolled_size_already \ No newline at end of file + return loop_unrolled_size_already From c684a49c0c6dd980f77019f9c86b7d8c27320eb3 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Mon, 13 Nov 2023 18:51:17 +0100 Subject: [PATCH 11/14] Reform the user-provided sm loop if it exceeds the layer size. Update pytest results. --- tests/main/test_origin/test_ascend_like.py | 2 +- .../test_origin/test_meta_prototype_like.py | 2 +- tests/main/test_origin/test_tesla_npu_like.py | 8 +-- tests/main/test_origin/test_tpu_like.py | 4 +- .../test_ascend_like.py | 2 +- .../test_edge_tpu_like.py | 8 +-- .../test_meta_prototype_like.py | 8 +-- .../test_tesla_npu_like.py | 8 +-- .../test_ascend_like.py | 2 +- .../test_tesla_npu_like.py | 6 +-- .../test_tpu_like.py | 4 +- zigzag/classes/opt/spatial/generator.py | 23 +++++++- .../stages/SpatialMappingConversionStage.py | 53 ++++++++++++------- 13 files changed, 82 insertions(+), 48 deletions(-) diff --git a/tests/main/test_origin/test_ascend_like.py b/tests/main/test_origin/test_ascend_like.py index f92e9788..248c6d0c 100644 --- a/tests/main/test_origin/test_ascend_like.py +++ b/tests/main/test_origin/test_ascend_like.py @@ -12,7 +12,7 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (5738192980.375, 8728331), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1913797698.5250015, 7439255), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1913797698.5250015, 7426499), "zigzag/inputs/examples/workload/resnet18.onnx": (1858697886.165, 3720129), "zigzag.inputs.examples.workload.resnet18": (2408671233.7250004, 4804196), } diff --git a/tests/main/test_origin/test_meta_prototype_like.py b/tests/main/test_origin/test_meta_prototype_like.py index 4d8f397f..e4299fba 100644 --- 
a/tests/main/test_origin/test_meta_prototype_like.py +++ b/tests/main/test_origin/test_meta_prototype_like.py @@ -12,7 +12,7 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (5771558839.89, 8400651), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1731935837.864999, 3606391), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1731935837.864999, 3594631), "zigzag/inputs/examples/workload/resnet18.onnx": (1869519792.3449998, 3408373), "zigzag.inputs.examples.workload.resnet18": (2419893343.4549994, 4176163), } diff --git a/tests/main/test_origin/test_tesla_npu_like.py b/tests/main/test_origin/test_tesla_npu_like.py index f8a98a2c..11a53097 100644 --- a/tests/main/test_origin/test_tesla_npu_like.py +++ b/tests/main/test_origin/test_tesla_npu_like.py @@ -11,10 +11,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (6131950030.816001, 8496179), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1671933042.2130003, 2964784), - "zigzag/inputs/examples/workload/resnet18.onnx": (1863717063.505, 3410738), - "zigzag.inputs.examples.workload.resnet18": (2375316568.8910007, 4096544), + "zigzag/inputs/examples/workload/alexnet.onnx": (6131950030.816001, 8486444), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1671933042.2130003, 2909436), + "zigzag/inputs/examples/workload/resnet18.onnx": (1863717063.505, 3395752), + "zigzag.inputs.examples.workload.resnet18": (2375316568.8910007, 4082454), } diff --git a/tests/main/test_origin/test_tpu_like.py b/tests/main/test_origin/test_tpu_like.py index d59700e6..a2ca227f 100644 --- a/tests/main/test_origin/test_tpu_like.py +++ b/tests/main/test_origin/test_tpu_like.py @@ -11,8 +11,8 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5567502618.941999, 9080913), - 
"zigzag/inputs/examples/workload/mobilenetv2.onnx": (1904494517.552001, 23131716), + "zigzag/inputs/examples/workload/alexnet.onnx": (5567502618.941999, 9078209), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1904494517.552001, 23112606), "zigzag/inputs/examples/workload/resnet18.onnx": (1795904779.6570003, 4160591), "zigzag.inputs.examples.workload.resnet18": (2296491401.491, 4909027), } diff --git a/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py b/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py index fcc09fa1..182a872f 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_ascend_like.py @@ -14,7 +14,7 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (5667407342.66, 8528846), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (921552096.0700004, 3835435), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (921552096.0700004, 3828967), "zigzag/inputs/examples/workload/resnet18.onnx": (1679218425.5100002, 3713386), "zigzag.inputs.examples.workload.resnet18": (2290766279.31, 4442443), } diff --git a/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py b/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py index 86acd714..8287ba69 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_edge_tpu_like.py @@ -13,10 +13,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5582430184.085, 8343378), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (762066732.5049998, 3003074), - "zigzag/inputs/examples/workload/resnet18.onnx": (1743190534.155, 5305825), - "zigzag.inputs.examples.workload.resnet18": (2087322696.315, 6155355), + "zigzag/inputs/examples/workload/alexnet.onnx": (5582059481.445, 8343378), + 
"zigzag/inputs/examples/workload/mobilenetv2.onnx": (819971935.77, 2430583), + "zigzag/inputs/examples/workload/resnet18.onnx": (1763135800.67, 5001291), + "zigzag.inputs.examples.workload.resnet18": (2090252961.0700002, 5858437), } diff --git a/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py b/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py index ff7ea9a8..c002b8d9 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_meta_prototype_like.py @@ -13,10 +13,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5681909351.240001, 8299150), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (919452681.2249999, 2894129), - "zigzag/inputs/examples/workload/resnet18.onnx": (1789888904.4450002, 3472280), - "zigzag.inputs.examples.workload.resnet18": (2348207081.7949996, 4238517), + "zigzag/inputs/examples/workload/alexnet.onnx": (5679695605, 8299150), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (901092009, 2610609), + "zigzag/inputs/examples/workload/resnet18.onnx": (1730672410, 3262009), + "zigzag.inputs.examples.workload.resnet18": (2265438430, 4017227), } diff --git a/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py b/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py index 682604d4..c4b0c5e6 100644 --- a/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py +++ b/tests/main/test_with_mix_spatial_mapping/test_tesla_npu_like.py @@ -13,10 +13,10 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (6040086796.366001, 8389669), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (958401881.3470002, 1964453), - "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3257898), - "zigzag.inputs.examples.workload.resnet18": 
(2220861655.6660004, 3934616), + "zigzag/inputs/examples/workload/alexnet.onnx": (6044768678, 8370470), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060, 1965457), + "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681, 3257898), + "zigzag.inputs.examples.workload.resnet18": (2220861655, 3934616), } diff --git a/tests/main/test_without_unused_memory/test_ascend_like.py b/tests/main/test_without_unused_memory/test_ascend_like.py index b6fc7a72..4eee129a 100644 --- a/tests/main/test_without_unused_memory/test_ascend_like.py +++ b/tests/main/test_without_unused_memory/test_ascend_like.py @@ -12,7 +12,7 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (5649555894.9, 8637780), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6499441), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (1881386179.71, 6486685), "zigzag/inputs/examples/workload/resnet18.onnx": (1709089377.83, 3583047), "zigzag.inputs.examples.workload.resnet18": (2243493483.15, 4657130), } diff --git a/tests/main/test_without_unused_memory/test_tesla_npu_like.py b/tests/main/test_without_unused_memory/test_tesla_npu_like.py index 3ccaafb2..25eb9648 100644 --- a/tests/main/test_without_unused_memory/test_tesla_npu_like.py +++ b/tests/main/test_without_unused_memory/test_tesla_npu_like.py @@ -12,9 +12,9 @@ # Expected energy and latency for each workload defined above ens_lats = { "zigzag/inputs/examples/workload/alexnet.onnx": (6040086796.366001, 8389669), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060.6110002, 1969009), - "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3267252), - "zigzag.inputs.examples.workload.resnet18": (2220861655.6660004, 3943074), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (930702060.6110002, 1965457), + "zigzag/inputs/examples/workload/resnet18.onnx": (1724869681.4799998, 3257898), + 
"zigzag.inputs.examples.workload.resnet18": (2220861655.6660004, 3934616), } diff --git a/tests/main/test_without_unused_memory/test_tpu_like.py b/tests/main/test_without_unused_memory/test_tpu_like.py index ae1fe912..28df3fa1 100644 --- a/tests/main/test_without_unused_memory/test_tpu_like.py +++ b/tests/main/test_without_unused_memory/test_tpu_like.py @@ -11,8 +11,8 @@ # Expected energy and latency for each workload defined above ens_lats = { - "zigzag/inputs/examples/workload/alexnet.onnx": (5475639384.492001, 8981556), - "zigzag/inputs/examples/workload/mobilenetv2.onnx": (952688145.0069999, 21873319), + "zigzag/inputs/examples/workload/alexnet.onnx": (5475639384.492001, 8979956), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (952688145.0069999, 21873214), "zigzag/inputs/examples/workload/resnet18.onnx": (1659252422.016, 4000289), "zigzag.inputs.examples.workload.resnet18": (1982830786.5119998, 4509235), } diff --git a/zigzag/classes/opt/spatial/generator.py b/zigzag/classes/opt/spatial/generator.py index cf7d74c6..4a7d4ddc 100644 --- a/zigzag/classes/opt/spatial/generator.py +++ b/zigzag/classes/opt/spatial/generator.py @@ -134,7 +134,28 @@ def generate_user_spatial_mappings( defined_mapping is not None and defined_mapping.get(oa_dim.name) is not None ): - oa_dim_unrollings = [defined_mapping.get(oa_dim.name)] + # scale down the defined_mapping size if it exceeds the layer dim size + ori_loop = defined_mapping.get(oa_dim.name) + loop_to_reform = [] + if self.is_nested_tuple(ori_loop): # mix sm loop + for sub_loop in ori_loop: + sub_loop_dim = sub_loop[0] + sub_loop_size = sub_loop[1] + if sub_loop_dim in self.layer.loop_dim_size.keys(): + if sub_loop_size > self.layer.loop_dim_size[sub_loop_dim]: + sub_loop_size = self.layer.loop_dim_size[sub_loop_dim] + loop_to_reform.append((sub_loop_dim, sub_loop_size)) + else: # single layer sm loop + loop_dim = ori_loop[0] + loop_size = ori_loop[1] + if loop_dim in self.layer.loop_dim_size.keys(): + if loop_size > 
self.layer.loop_dim_size[loop_dim]: + loop_size = self.layer.loop_dim_size[loop_dim] + loop_to_reform.append((loop_dim, loop_size)) + loop_to_reform = tuple(loop_to_reform) + if len(loop_to_reform) == 0: + loop_to_reform = None + oa_dim_unrollings = [loop_to_reform] else: oa_dim_unrollings = [] oa_dim_unrolling_hints = user_spatial_mapping_hint[oa_dim.name] diff --git a/zigzag/classes/stages/SpatialMappingConversionStage.py b/zigzag/classes/stages/SpatialMappingConversionStage.py index f995df88..b55d3cf4 100644 --- a/zigzag/classes/stages/SpatialMappingConversionStage.py +++ b/zigzag/classes/stages/SpatialMappingConversionStage.py @@ -120,6 +120,7 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): oa_dim_name, spatial_loop_element, user_spatial_mapping, + limited_user_spatial_mapping, ) ) limited_user_spatial_mapping_int_to_check = ( @@ -129,6 +130,7 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): oa_dim_name, spatial_loop_element, user_spatial_mapping, + limited_user_spatial_mapping, False, ) ) @@ -164,6 +166,7 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): oa_dim_name, spatial_loop, user_spatial_mapping, + limited_user_spatial_mapping, ) ) limited_user_spatial_mapping_int_to_check = ( @@ -173,6 +176,7 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): oa_dim_name, spatial_loop, user_spatial_mapping, + limited_user_spatial_mapping, False, ) ) @@ -230,7 +234,8 @@ def generate_limited_user_spatial_mapping( oa_dim_name, spatial_loop, user_spatial_mapping, - check_3=True, + limited_user_spatial_mapping, + allow_decimal_sm_loop_size=True, ): ## Do check on spatial mapping, and convert the mapping to a tuple (loop_dim_unrolled, loop_size_unrolled) = spatial_loop @@ -245,35 +250,43 @@ def generate_limited_user_spatial_mapping( # Check 2: Limit unrolling if layer dimension is smaller than provided unrolling or if the loop dim doesn't exist layer_dim_size = layer_dim_sizes.get(loop_dim_unrolled, 1) 
loop_size_unrolled = min(layer_dim_size, loop_size_unrolled) - if check_3: - # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size - # and if there is no more mapping for this layer dimension - no_more_mapping_for_current_layer_dim = ( - self.check_if_there_is_further_oa_mapping_for_current_layer_dim( + # Check 3: Adjust unrolling if it is not a multiple of the layer dimension size + # and if there is no more mapping for this layer dimension + no_more_mapping_for_current_layer_dim = ( + self.check_if_there_is_further_oa_mapping_for_current_layer_dim( + oa_dim_name=oa_dim_name, + loop_dim_unrolled=loop_dim_unrolled, + user_spatial_mapping=user_spatial_mapping, + ) + ) + if no_more_mapping_for_current_layer_dim: + loop_size_unrolled_on_early_oa_dims = ( + self.calc_unrolled_loop_size_on_early_oa_dims( oa_dim_name=oa_dim_name, loop_dim_unrolled=loop_dim_unrolled, - user_spatial_mapping=user_spatial_mapping, + user_spatial_mapping=limited_user_spatial_mapping, ) ) - if no_more_mapping_for_current_layer_dim: - loop_size_unrolled_on_early_oa_dims = ( - self.calc_unrolled_loop_size_on_early_oa_dims( - oa_dim_name=oa_dim_name, - loop_dim_unrolled=loop_dim_unrolled, - user_spatial_mapping=user_spatial_mapping, - ) - ) - temporal_remainder = int( - np.ceil( - layer_dim_size - / (loop_size_unrolled * loop_size_unrolled_on_early_oa_dims) - ) + temporal_remainder = int( + np.ceil( + layer_dim_size + / (loop_size_unrolled * loop_size_unrolled_on_early_oa_dims) ) + ) + if allow_decimal_sm_loop_size: loop_size_unrolled = ( layer_dim_size / temporal_remainder / loop_size_unrolled_on_early_oa_dims ) + else: + loop_size_unrolled = int( + np.ceil( + layer_dim_size + / temporal_remainder + / loop_size_unrolled_on_early_oa_dims + ) + ) return ( loop_dim_unrolled, loop_size_unrolled, From 79df7f8c7773a1e70e624f69cc8583f633ffc9a2 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Mon, 13 Nov 2023 18:59:44 +0100 Subject: [PATCH 12/14] update top_level_spatial_mapping 
in SpatialMappingConversionStage to support mix sm loop --- .../stages/SpatialMappingConversionStage.py | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/zigzag/classes/stages/SpatialMappingConversionStage.py b/zigzag/classes/stages/SpatialMappingConversionStage.py index b55d3cf4..7d08ae4f 100644 --- a/zigzag/classes/stages/SpatialMappingConversionStage.py +++ b/zigzag/classes/stages/SpatialMappingConversionStage.py @@ -220,12 +220,21 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): layer=self.layer, accelerator=self.accelerator, ) + try: + SpatialMapping(spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer), SpatialMapping( + spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer + ) + except: + pass - return SpatialMapping( - spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer - ), SpatialMapping( - spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer - ) + try: + return SpatialMapping( + spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer + ), SpatialMapping( + spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer + ) + except: + pass def generate_limited_user_spatial_mapping( self, @@ -363,8 +372,28 @@ def generate_spatial_mapping_dict(self, user_spatial_mapping, layer, accelerator # After we have gone through the memory levels, if there are still user-defined dimensions # present, add them as the top level. Otherwise add an empty list to make arch levels correct: # because first list we added was the operational array level. + + # We will merge together if the top memory level is serving multiple oa dims + # and there are layer dims existing on multiple oa dims. 
+ top_level_spatial_mapping_dict = {} + for (dim_name, spatial_loop) in user_sm_copy.items(): + if self.is_nested_tuple(spatial_loop): # mix sm loop + for sub_spatial_loop in spatial_loop: + spatial_loop_dim = sub_spatial_loop[0] + spatial_loop_size = sub_spatial_loop[1] + if spatial_loop_dim not in top_level_spatial_mapping_dict.keys(): + top_level_spatial_mapping_dict[spatial_loop_dim] = spatial_loop_size + else: + top_level_spatial_mapping_dict[spatial_loop_dim] *= spatial_loop_size + else: + spatial_loop_dim = spatial_loop[0] + spatial_loop_size = spatial_loop[1] + if spatial_loop_dim not in top_level_spatial_mapping_dict.keys(): + top_level_spatial_mapping_dict[spatial_loop_dim] = spatial_loop_size + else: + top_level_spatial_mapping_dict[spatial_loop_dim] *= spatial_loop_size top_level_spatial_mapping = [ - spatial_loop for (dim_name, spatial_loop) in user_sm_copy.items() + (layer_dim, layer_size) for (layer_dim, layer_size) in top_level_spatial_mapping_dict.items() ] spatial_mapping_dict[layer_op].append(top_level_spatial_mapping) return spatial_mapping_dict From aff0de7a3aac42427a53b3a12e7a6c4aabd2a297 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Mon, 13 Nov 2023 19:18:33 +0100 Subject: [PATCH 13/14] update SearchUnusedMemoryStage and keep top weight mem to be a mem that serves all hardware dims --- .../classes/stages/SearchUnusedMemoryStage.py | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/zigzag/classes/stages/SearchUnusedMemoryStage.py b/zigzag/classes/stages/SearchUnusedMemoryStage.py index c63136b0..5143ee9f 100644 --- a/zigzag/classes/stages/SearchUnusedMemoryStage.py +++ b/zigzag/classes/stages/SearchUnusedMemoryStage.py @@ -302,9 +302,16 @@ def update_top_mem_level(self): if ( const_operand in served_operands ): # identify the top weight mem level + # We need to check if the current mem serves all oa dims, otherwise we will not decrease + # the mem_update_weight.
+ # The reason is if the current mem not serve all oa dims, the mapping will impact the memory + # utilization, so solely comparing with total memory size will be incorrect. + mem_serve_all_oa_dims = self.check_if_mem_serve_all_oa_dims( + mem, self.accelerator + ) if ( curr_mem_level < self.mem_update_weight - ): # mem_update_weight is bigger than the top weight mem level + ) and mem_serve_all_oa_dims: # mem_update_weight is bigger than the top weight mem level self.mem_update_weight = curr_mem_level break else: ## node (layer) that is not a branch starting node or a branch final node @@ -402,9 +409,18 @@ def update_top_mem_level(self): self.update_IO_mem_level( curr_id, output_operand, curr_mem_level ) # update output mem level + # For weight, we need to check if the current mem serve all oa dims, otherwise we will not + # decrease the mem_update_weight. + # The reason is if the current mem not serve all oa dims, the mapping will impact the memory + # utilization, so solely comparing with total memory size will be incorrect. + mem_serve_all_oa_dims = self.check_if_mem_serve_all_oa_dims( + mem, self.accelerator + ) if ( - curr_mem_level < self.mem_update_weight - ) and mem_serve_weight: # update weight mem level + (curr_mem_level < self.mem_update_weight) + and mem_serve_all_oa_dims + and mem_serve_weight + ): # update weight mem level self.mem_update_weight = curr_mem_level ## [OPTIONAL CHECK] assert check if there is -1 value in mem_update_list ## [NOTE] Until here, if there is still -1 value in mem_update_list, it means the size of top mem level for IO is not big enough. @@ -414,6 +430,17 @@ def update_top_mem_level(self): list(operand_dict.values())[0] >= 0 ), "SearchUnusedMemoryStage fisnishes abnormally, there are still layers with top mem levels not figured out." 
+ def check_if_mem_serve_all_oa_dims(self, mem, accelerator): + # check if mem serves all hardware dimensions + core = accelerator.cores[0] + operational_array = core.operational_array + oa_dim_nb = len(operational_array.dimensions) + mem_served_oa_dim_nb = len(mem.served_dimensions) + if mem_served_oa_dim_nb == oa_dim_nb: + return True + else: + return False + def update_mem_level_for_loading_data(self): """ [OPTIONAL FUNCTION] This is an optional function. From 47c2cf585b4032f7aed8b5b5a99d70026f3e72e1 Mon Sep 17 00:00:00 2001 From: JiacongSun Date: Mon, 13 Nov 2023 20:09:29 +0100 Subject: [PATCH 14/14] remove codes for debugging --- .../stages/SpatialMappingConversionStage.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/zigzag/classes/stages/SpatialMappingConversionStage.py b/zigzag/classes/stages/SpatialMappingConversionStage.py index 7d08ae4f..617921e7 100644 --- a/zigzag/classes/stages/SpatialMappingConversionStage.py +++ b/zigzag/classes/stages/SpatialMappingConversionStage.py @@ -220,21 +220,12 @@ def convert_user_spatial_mapping(self, user_spatial_mapping): layer=self.layer, accelerator=self.accelerator, ) - try: - SpatialMapping(spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer), SpatialMapping( - spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer - ) - except: - pass - - try: - return SpatialMapping( - spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer - ), SpatialMapping( - spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer - ) - except: - pass + return SpatialMapping( + spatial_mapping_dict=spatial_mapping_dict, layer_node=self.layer + ), SpatialMapping( + spatial_mapping_dict=spatial_mapping_dict_int, layer_node=self.layer + ) def generate_limited_user_spatial_mapping( self,