diff --git a/test/NeuralEngineFunctionalModel.py b/test/NeuralEngineFunctionalModel.py index b41702b..32bcc5a 100644 --- a/test/NeuralEngineFunctionalModel.py +++ b/test/NeuralEngineFunctionalModel.py @@ -61,6 +61,10 @@ def _norm_quant( tensor = tensor >> global_shift + if verbose: + print("INTERMEDIATE RESULTS (after shift):") + print(tensor) + # Saturate into out_type tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True) diff --git a/test/NeurekaMemoryLayout.py b/test/NeurekaMemoryLayout.py index 028c7a3..d5f266b 100644 --- a/test/NeurekaMemoryLayout.py +++ b/test/NeurekaMemoryLayout.py @@ -24,7 +24,7 @@ class NeurekaMemoryLayout: _WEIGHT_BANDWIDTH = 256 _CIN_SUBTILE_1x1 = 32 - _CIN_SUBTILE_3x3 = 28 + _CIN_SUBTILE_3x3 = 32 @staticmethod def weightEncode( @@ -84,26 +84,16 @@ def weightEncode( constant_values=0, ) elif height == 1 and width == 1: - # Tile cinSubtile into tiles of size 4 - # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile) - weight = weight.reshape( - cout, cinMajor, bits, height * width, cinSubtile // 4, 4 - ) # cout, cinMajor, bits, 1, 8, 4 - # Pad bits to 8 - if bits < 8: - # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile) - weight = np.pad( - weight, - ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)), - mode="constant", - constant_values=0, - ) - # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile) - weight = weight.transpose(0, 1, 3, 4, 2, 5) + # (cout * cinMajor, Bits * cinSubtile) + weight = weight.reshape(-1, bits * cinSubtile) + # Pad only the last dimension to weight bandwidth size # (-1, Weight Bandwidth) - weight = weight.reshape( - cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - ) # cout*cinMajor, 256b + weight = np.pad( + weight, + ((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])), + "constant", + constant_values=0, + ) # Prepare for packing # (-1, Weight Bandwidth Bytes, 8) diff --git 
# a/test/NeurekaMemoryLayoutSiracusa.py b/test/NeurekaMemoryLayoutSiracusa.py new file mode 100644 index 0000000..748bd34 --- /dev/null +++ b/test/NeurekaMemoryLayoutSiracusa.py
# Luka Macan
# Arpan Suravi Prasad
#
# Copyright 2023 ETH Zurich and University of Bologna
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

import numpy as np
import numpy.typing as npt


class NeurekaMemoryLayoutSiracusa:
    """Bit-level weight packing/unpacking for 3x3 and 1x1 kernels.

    Weights are split into bit planes and packed into rows of
    ``_WEIGHT_BANDWIDTH`` bits; every row is emitted as
    ``_WEIGHT_BANDWIDTH / 8`` little-endian-bit-ordered bytes.
    """

    # Number of bits in one packed weight row
    _WEIGHT_BANDWIDTH = 256
    # Input channels grouped into one row for 1x1 kernels
    _CIN_SUBTILE_1x1 = 32
    # Input channels grouped into one row for 3x3 kernels
    _CIN_SUBTILE_3x3 = 28

    @staticmethod
    def _cinSubtileFor(height: int, width: int) -> int:
        """Return the cin subtile size for a kernel, rejecting unsupported shapes."""
        if height == 3 and width == 3:
            return NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_3x3
        if height == 1 and width == 1:
            return NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_1x1
        raise ValueError(
            f"Unsupported kernel shape {height}x{width}; expected 3x3 or 1x1"
        )

    @staticmethod
    def _checkBits(bits: int) -> None:
        """Reject bit widths that cannot be represented in a uint8 weight."""
        if not 1 <= bits <= 8:
            raise ValueError(f"bits must be in the range [1, 8], got {bits}")

    @staticmethod
    def weightEncode(
        weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
    ) -> npt.NDArray[np.uint8]:
        """Unroll weight into the packed memory format.

        Expected weight shape is (cout, cin, H, W); with ``depthwise=True`` the
        first two axes are swapped before encoding.

        cin is zero-padded up to a multiple of the cin subtile and split into
        cinMajor = ceil(cin / cin subtile) groups. The produced layout depends
        on the kernel shape:
          - 3x3: (cout, cinMajor, Bits, H x W x cinSubtile_3x3), each row
            zero-padded to Weight Bandwidth bits,
          - 1x1: (cout, cinMajor, cinSubtile_1x1 / 4, Bits padded to 8, 4),
            i.e. exactly one Weight Bandwidth row per (cout, cinMajor).

        Args:
            weight: uint8 array of shape (cout, cin, H, W).
            bits: number of low-order bits of each weight to keep (1 to 8).
            depthwise: swap cout and cin before encoding.

        Returns:
            Flat uint8 array holding the packed rows back to back.

        Raises:
            ValueError: if the kernel is neither 3x3 nor 1x1, or ``bits`` is
                outside [1, 8].
        """
        layout = NeurekaMemoryLayoutSiracusa
        layout._checkBits(bits)

        if depthwise:
            weight = weight.transpose(1, 0, 2, 3)  # Swap cout and cin

        cout, cin, height, width = weight.shape
        cinSubtile = layout._cinSubtileFor(height, width)
        bandwidth = layout._WEIGHT_BANDWIDTH

        # Integer ceil-division; avoids the float round trip of np.ceil
        cinMajor = -(-cin // cinSubtile)

        # Pad cin to be divisible with the cin subtile
        cinPad = cinMajor * cinSubtile - cin
        if cinPad > 0:
            weight = np.pad(
                weight,
                ((0, 0), (0, cinPad), (0, 0), (0, 0)),
                "constant",
                constant_values=0,
            )

        # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1)
        # The 1 at the end is required by the unpacking
        weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1)

        # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
        # (cout, cinMajor, cinSubtile, Flattened spatial, Bits)
        weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")

        # Shuffle bits so that the shape is:
        # (cout, cinMajor, Bits, Flattened spatial, cinSubtile)
        weight = weight.transpose(0, 1, 4, 3, 2)

        if height == 3:
            # (cout * cinMajor * Bits, H * W * cinSubtile)
            weight = weight.reshape(-1, height * width * cinSubtile)
            # Pad only the last dimension to weight bandwidth size
            # (-1, Weight Bandwidth)
            weight = np.pad(
                weight,
                ((0, 0), (0, bandwidth - weight.shape[-1])),
                "constant",
                constant_values=0,
            )
        else:
            # Tile cinSubtile into tiles of size 4
            # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
            weight = weight.reshape(
                cout, cinMajor, bits, height * width, cinSubtile // 4, 4
            )
            # Pad bits to 8 so that every row is fully populated
            if bits < 8:
                weight = np.pad(
                    weight,
                    ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
                    mode="constant",
                    constant_values=0,
                )
            # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
            weight = weight.transpose(0, 1, 3, 4, 2, 5)
            # (cout * cinMajor, Weight Bandwidth)
            weight = weight.reshape(cout * cinMajor, bandwidth)

        # Group every row into bytes: (rows, Weight Bandwidth Bytes, 8)
        weightBandwidthBytes = -(-bandwidth // 8)
        weight = weight.reshape(weight.shape[0], weightBandwidthBytes, 8)

        # Pack bits: (rows, Weight Bandwidth Bytes, 1)
        weight = np.packbits(weight, axis=-1, bitorder="little")

        return weight.flatten()

    @staticmethod
    def weightDecode(
        weight: npt.NDArray[np.uint8],
        bits: int,
        cout: int,
        cin: int,
        height: int,
        width: int,
    ) -> npt.NDArray[np.uint8]:
        """Reverse of weightEncode (for ``depthwise=False``).

        Args:
            weight: flat uint8 array produced by ``weightEncode``.
            bits: bit width that was used for encoding (1 to 8).
            cout: number of output channels of the original weight.
            cin: number of (unpadded) input channels of the original weight.
            height: kernel height (3 or 1).
            width: kernel width (3 or 1).

        Returns:
            uint8 array of shape (cout, cin, height, width).

        Raises:
            ValueError: if the kernel is neither 3x3 nor 1x1, ``bits`` is
                outside [1, 8], or the buffer size does not match the shape.
        """
        layout = NeurekaMemoryLayoutSiracusa
        layout._checkBits(bits)

        cinMinor = layout._cinSubtileFor(height, width)
        cinMajor = -(-cin // cinMinor)
        bandwidth = layout._WEIGHT_BANDWIDTH
        weightBandwidthBytes = -(-bandwidth // 8)

        # Expand every byte back into 8 bits: (rows, Weight Bandwidth)
        weight = weight.reshape(-1, weightBandwidthBytes, 1)
        weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
        weight = weight.reshape(-1, bandwidth)

        if height == 3:
            # Drop the row padding, then move the bit planes to the last axis
            weight = weight[:, : height * width * cinMinor]
            weight = weight.reshape(
                cout, cinMajor, bits, height * width, cinMinor
            ).transpose(0, 1, 4, 3, 2)
        else:
            # Rows are (cinSubtileMajor, PaddedBits, cinSubtileTile);
            # move the 8 padded bit planes to the last axis
            weight = weight[:, : height * width * cinMinor * 8]
            weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose(
                0, 1, 2, 4, 3
            )

        # Re-assemble the bit planes into uint8 values
        weight = np.packbits(weight, axis=-1, bitorder="little")
        weight = weight.reshape(cout, cinMajor * cinMinor, height, width)

        # Strip the channels that were added as padding by the encoder
        return weight[:, :cin, :, :]


# diff --git a/test/testgen.py b/test/testgen.py index
521aecc..c128ff4 100644 --- a/test/testgen.py +++ b/test/testgen.py @@ -20,6 +20,8 @@ import json import os from typing import Optional, Set, Type, Union +import numpy as np +import sys import toml @@ -52,6 +54,7 @@ def headers_gen( def print_tensors(test: NnxTest): + np.set_printoptions(threshold=sys.maxsize) print("INPUT TENSOR:") print(test.input) print("WEIGHT TENSOR:") @@ -83,7 +86,21 @@ def test_gen( exit(-1) test_conf = nnxTestConfCls.model_validate(test_conf_dict) - test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors) + if test_conf_dict['synthetic_weights']: + import torch + weight = torch.zeros((test_conf.out_channel, 1 if test_conf.depthwise else test_conf.in_channel, test_conf.kernel_shape.height, test_conf.kernel_shape.width), dtype=torch.int64) + for i in range(0, min(weight.shape[0], weight.shape[1])): + weight[i,i,0,0] = 1 + else: + weight = None + if test_conf_dict['synthetic_inputs']: + import torch + inputs = torch.zeros((1, test_conf.in_channel, test_conf.in_height, test_conf.in_width), dtype=torch.int64) + for i in range(test_conf.in_channel): + inputs[:, i,0,0] = i + else: + inputs = None + test = NnxTestGenerator.from_conf(test_conf, verbose=args.print_tensors, weight=weight, input=inputs) if not args.skip_save: test.save(args.test_dir) if args.headers: