From 5a9c4b2cf3170c9966b5716f5b651c02a20aeaf2 Mon Sep 17 00:00:00 2001
From: Francesco Conti <f.conti@unibo.it>
Date: Tue, 16 Apr 2024 21:14:27 +0000
Subject: [PATCH] Align to newer N-EUREKA configuration

---
 test/NeuralEngineFunctionalModel.py |   4 +
 test/NeurekaMemoryLayout.py         |  30 ++----
 test/NeurekaMemoryLayoutSiracusa.py | 156 ++++++++++++++++++++++++++++
 test/testgen.py                     |  19 +++-
 4 files changed, 188 insertions(+), 21 deletions(-)
 create mode 100644 test/NeurekaMemoryLayoutSiracusa.py

diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py
index b41702b..32bcc5a 100644
--- a/test/NeuralEngineFunctionalModel.py
+++ b/test/NeuralEngineFunctionalModel.py
@@ -61,6 +61,10 @@ def _norm_quant(
 
         tensor = tensor >> global_shift
 
+        if verbose:
+            print("INTERMEDIATE RESULTS (after shift):")
+            print(tensor)
+
         # Saturate into out_type
         tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True)
 
diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py
index 028c7a3..d5f266b 100644
--- a/test/NeurekaMemoryLayout.py
+++ b/test/NeurekaMemoryLayout.py
@@ -24,7 +24,7 @@
 class NeurekaMemoryLayout:
     _WEIGHT_BANDWIDTH = 256
     _CIN_SUBTILE_1x1 = 32
-    _CIN_SUBTILE_3x3 = 28
+    _CIN_SUBTILE_3x3 = 32
 
     @staticmethod
     def weightEncode(
@@ -84,26 +84,16 @@ def weightEncode(
                 constant_values=0,
             )
         elif height == 1 and width == 1:
-            # Tile cinSubtile into tiles of size 4
-            # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
-            weight = weight.reshape(
-                cout, cinMajor, bits, height * width, cinSubtile // 4, 4
-            )  # cout, cinMajor, bits, 1, 8, 4
-            # Pad bits to 8
-            if bits < 8:
-                # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
-                weight = np.pad(
-                    weight,
-                    ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
-                    mode="constant",
-                    constant_values=0,
-                )
-            # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
-            weight = weight.transpose(0, 1, 3, 4, 2, 5)
+            # (cout * cinMajor, Bits * cinSubtile)
+            weight = weight.reshape(-1, bits * cinSubtile)
+            # Pad only the last dimension to weight bandwidth size
             # (-1, Weight Bandwidth)
-            weight = weight.reshape(
-                cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH
-            )  # cout*cinMajor, 256b
+            weight = np.pad(
+                weight,
+                ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])),
+                "constant",
+                constant_values=0,
+            )
 
         # Prepare for packing
         # (-1, Weight Bandwidth Bytes, 8)
diff --git a/test/NeurekaMemoryLayoutSiracusa.py b/test/NeurekaMemoryLayoutSiracusa.py
new file mode 100644
index 0000000..748bd34
--- /dev/null
+++ b/test/NeurekaMemoryLayoutSiracusa.py
@@ -0,0 +1,156 @@
+# Luka Macan <luka.macan@unibo.it>
+# Arpan Suravi Prasad <prasadar@iis.ee.ethz.ch>
+#
+# Copyright 2023 ETH Zurich and University of Bologna
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import numpy.typing as npt
+
+
+class NeurekaMemoryLayoutSiracusa:
+    _WEIGHT_BANDWIDTH = 256  # width in bits of one packed weight row
+    _CIN_SUBTILE_1x1 = 32  # cin subtile used when the kernel height is not 3
+    _CIN_SUBTILE_3x3 = 28  # cin subtile used when the kernel height is 3 (9 * 28 = 252 <= 256)
+
+    @staticmethod
+    def weightEncode(
+        weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
+    ) -> npt.NDArray[np.uint8]:
+        """Unroll weight into expected memory format
+
+        Expected weight shape is (cout, cin, H, W).
+        The produced memory layout depends on the weight kernel shape:
+          - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits),
+          - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits),
+        where cinMajor is the ceil(cin / cin subtile <mode>) and cinMinor has to be padded with 0 to cin subtile <mode>.
+        """
+        if depthwise:
+            weight = weight.transpose(1, 0, 2, 3)  # Swap cout and cin
+
+        cout, cin, height, width = weight.shape
+        cinSubtile = (
+            NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_3x3
+            if height == 3
+            else NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_1x1
+        )
+
+        # Zero-pad cin up to the next multiple of the cin subtile
+        if cin % cinSubtile != 0:
+            cinPad = cinSubtile - cin % cinSubtile
+            weight = np.pad(
+                weight,
+                ((0, 0), (0, cinPad), (0, 0), (0, 0)),
+                "constant",
+                constant_values=0,
+            )
+
+        # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1)
+        # The trailing axis of size 1 is the one np.unpackbits expands into bits
+        cinMajor = int(np.ceil(cin / cinSubtile))
+        weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1)
+
+        # Unpack the low 'bits' bits of each value, LSB first, e.g. bits=4: 3 => [1, 1, 0, 0]
+        # (cout, cinMajor, cinSubtile, Flattened spatial, Bits)
+        weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")
+
+        # Shuffle bits so that the final shape is:
+        # (cout, cinMajor, Bits, Flattened spatial, cinSubtile)
+        weight = weight.transpose(0, 1, 4, 3, 2)
+
+        # Pack dimensions to fit into weight bandwidth
+        if height == 3 and width == 3:
+            # (cout * cinMajor * Bits, H * W * cinSubtile)
+            weight = weight.reshape(-1, height * width * cinSubtile)
+            # Pad only the last dimension to weight bandwidth size
+            # (-1, Weight Bandwidth)
+            weight = np.pad(
+                weight,
+                ((0, 0), (0, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH - weight.shape[-1])),
+                "constant",
+                constant_values=0,
+            )
+        elif height == 1 and width == 1:  # NOTE(review): any other kernel shape skips both branches - confirm callers never pass one
+            # Tile cinSubtile into tiles of size 4
+            # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
+            weight = weight.reshape(
+                cout, cinMajor, bits, height * width, cinSubtile // 4, 4
+            )  # cout, cinMajor, bits, 1, 8, 4
+            # Zero-pad the bits axis to 8 so that 8 * cinSubtile fills the weight bandwidth
+            if bits < 8:
+                # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
+                weight = np.pad(
+                    weight,
+                    ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
+                    mode="constant",
+                    constant_values=0,
+                )
+            # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
+            weight = weight.transpose(0, 1, 3, 4, 2, 5)
+            # (-1, Weight Bandwidth)
+            weight = weight.reshape(
+                cout * cinMajor, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH
+            )  # cout*cinMajor, 256b
+
+        # Prepare for packing
+        # (-1, Weight Bandwidth Bytes, 8)
+        weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH / 8))
+        weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2)
+
+        # Pack bits
+        # (-1, Weight Bandwidth Bytes)
+        weight = np.packbits(weight, axis=-1, bitorder="little")
+
+        return weight.flatten()  # 1-D array of packed bytes
+
+    @staticmethod
+    def weightDecode(
+        weight: npt.NDArray[np.uint8],
+        bits: int,
+        cout: int,
+        cin: int,
+        height: int,
+        width: int,
+    ) -> npt.NDArray[np.uint8]:
+        """Reverse of weightEncode"""
+        cinSubtile = (
+            NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_3x3
+            if height == 3
+            else NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_1x1
+        )
+        cinMajor = int(np.ceil(cin / cinSubtile))
+        cinMinor = cinSubtile
+        weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH / 8))
+
+        weight = weight.reshape(-1, weightBandwidthBytes, 1)  # trailing axis of size 1 lets unpackbits expand each byte
+        weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
+        weight = weight.reshape(-1, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH)  # one row per Weight Bandwidth bits
+
+        if height == 3 and width == 3:
+            weight = weight[:, : height * width * cinMinor]  # keep the H * W * cinMinor payload columns
+            weight = weight.reshape(
+                cout, cinMajor, bits, height * width, cinMinor
+            ).transpose(0, 1, 4, 3, 2)
+        elif height == 1 and width == 1:
+            weight = weight[:, : height * width * cinMinor * 8]  # 1 * 1 * 32 * 8 = 256 with the class constants, so the whole row is kept
+            weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose(
+                0, 1, 2, 4, 3
+            )
+        weight = np.packbits(weight, axis=-1, bitorder="little")  # collapse the trailing bits axis back into bytes
+        weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
+        weight = weight[:, :cin, :, :]  # drop the channels beyond the requested cin
+
+        return weight
diff --git a/test/testgen.py b/test/testgen.py
index 521aecc..c128ff4 100644
--- a/test/testgen.py
+++ b/test/testgen.py
@@ -20,6 +20,8 @@
 import json
 import os
 from typing import Optional, Set, Type, Union
+import numpy as np
+import sys
 
 import toml
 
@@ -52,6 +54,7 @@ def headers_gen(
 
 
 def print_tensors(test: NnxTest):
+    np.set_printoptions(threshold=sys.maxsize)
     print("INPUT TENSOR:")
     print(test.input)
     print("WEIGHT TENSOR:")
@@ -83,7 +86,21 @@ def test_gen(
         exit(-1)
 
     test_conf = nnxTestConfCls.model_validate(test_conf_dict)
-    test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors)
+    # 'synthetic_*' keys are optional: a missing or falsy key leaves the tensor as None.
+    weight = inputs = None
+    if test_conf_dict.get('synthetic_weights'):
+        import torch  # imported lazily: only needed when synthetic tensors are requested
+        # All-zero kernel with a 1 at [i, i, 0, 0] for every i valid on both channel axes.
+        weight = torch.zeros((test_conf.out_channel, 1 if test_conf.depthwise else test_conf.in_channel, test_conf.kernel_shape.height, test_conf.kernel_shape.width), dtype=torch.int64)
+        for i in range(min(weight.shape[0], weight.shape[1])):
+            weight[i, i, 0, 0] = 1
+    if test_conf_dict.get('synthetic_inputs'):
+        import torch  # imported lazily: only needed when synthetic tensors are requested
+        # All-zero input whose pixel (0, 0) of channel i holds the value i.
+        inputs = torch.zeros((1, test_conf.in_channel, test_conf.in_height, test_conf.in_width), dtype=torch.int64)
+        for i in range(test_conf.in_channel):
+            inputs[:, i, 0, 0] = i
+    test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors, weight=weight, input=inputs)
     if not args.skip_save:
         test.save(args.test_dir)
     if args.headers: