diff --git a/.gitignore b/.gitignore index 89807f12..0f4057bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] +*.pyc *$py.class .idea @@ -152,4 +153,7 @@ docs/make.bat html/ # debug file -debug* \ No newline at end of file +debug* + +# cacti cache files for imc +zigzag/classes/cacti/cacti_master/self_gen/ diff --git a/README.md b/README.md index 46a0ac9d..7458843d 100644 --- a/README.md +++ b/README.md @@ -53,3 +53,8 @@ L. Mei, K. Goetschalckx, A. Symons and M. Verhelst, " DeFiNES: Enabling Fast Exp A. Symons, L. Mei, S. Colleman, P. Houshmand, S. Karl and M. Verhelst, “Towards Heterogeneous Multi-core Accelerators Exploiting Fine-grained Scheduling of Layer-Fused Deep Neural Networks”, arXiv e-prints, 2022. doi:10.48550/arXiv.2212.10612. [paper](https://arxiv.org/abs/2212.10612), [github](https://github.com/ZigZag-Project/stream) S. Karl, A. Symons, N. Fasfous and M. Verhelst, "Genetic Algorithm-based Framework for Layer-Fused Scheduling of Multiple DNNs on Multi-core Systems," 2023 Design, Automation & Test in Europe Conference & Exhibition (DATE), Antwerp, Belgium, 2023, pp. 1-6, doi: 10.23919/DATE56975.2023.10137070. [paper](https://ieeexplore.ieee.org/document/10137070), [slides](https://www.dropbox.com/s/rv8qiko59h4pp0s/Genetic%20Algorithm-based%20Framework%20for.pptx?dl=0), [video](https://www.dropbox.com/s/12v94stvevj9xns/Genetic%20Algorithm-based%20Framework%20for.mp4?dl=0) + +#### Extend ZigZag to support In-Memory-Computing cores +J. Sun, P. Houshmand and M. Verhelst, "Analog or Digital In-Memory Computing? Benchmarking through Quantitative Modeling," Proceedings of the IEEE/ACM International Conference On Computer Aided Design (ICCAD), October 2023.
[paper](https://ieeexplore.ieee.org/document/10323763), [poster](https://drive.google.com/file/d/1EVdua-y2Wg8WL-ovUIw7KUR9kpnpN4AS/view?usp=sharing), [slides](https://docs.google.com/presentation/d/19OXRDh6NCBUIOVGneO3lrZfVT58xh06U/edit?usp=sharing&ouid=108247328431603587200&rtpof=true&sd=true), [video](https://drive.google.com/file/d/10-k4XEPan-O-QAH4Q0uvone36qfNRCpK/view?usp=sharing) + +P. Houshmand, J. Sun and M. Verhelst, "Benchmarking and modeling of analog and digital SRAM in-memory computing architectures," arXiv preprint arXiv:2305.18335 (2023). [paper](https://arxiv.org/abs/2305.18335) diff --git a/tests/main/test_imc/__init__.py b/tests/main/test_imc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/main/test_imc/test_aimc.py b/tests/main/test_imc/test_aimc.py new file mode 100644 index 00000000..afcd1e7c --- /dev/null +++ b/tests/main/test_imc/test_aimc.py @@ -0,0 +1,40 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_imc + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + "zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy, latency (#cycles), clk time and area for each workload defined above +ens_lats_clks_areas = { + "zigzag/inputs/examples/workload/alexnet.onnx": (2557076250.266322, 44012016.0, 6.61184, 0.7892517658006044), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (802185102.578702, 14939020.0, 6.61184, 0.7892517658006044), + "zigzag/inputs/examples/workload/resnet18.onnx": (2252151728.145326, 62079022.0, 6.61184, 0.7892517658006044), + "zigzag.inputs.examples.workload.resnet18": (2466090000.2577806, 67309272.0, 6.61184, 0.7892517658006044), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.default_imc" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Aimc" + + 
+@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, tclk, area, cmes) = get_hardware_performance_zigzag_imc( + workload, accelerator, mapping + ) + (expected_energy, expected_latency, expected_tclk, expected_area) = ens_lats_clks_areas[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) + assert tclk == pytest.approx(expected_tclk) + assert area == pytest.approx(expected_area) diff --git a/tests/main/test_imc/test_dimc.py b/tests/main/test_imc/test_dimc.py new file mode 100644 index 00000000..39cf0abd --- /dev/null +++ b/tests/main/test_imc/test_dimc.py @@ -0,0 +1,40 @@ +import pytest + +from zigzag.api import get_hardware_performance_zigzag_imc + +workloads = ( + "zigzag/inputs/examples/workload/alexnet.onnx", + "zigzag/inputs/examples/workload/mobilenetv2.onnx", + "zigzag/inputs/examples/workload/resnet18.onnx", + "zigzag.inputs.examples.workload.resnet18", +) + +# Expected energy, latency (#cycles), clk time and area for each workload defined above +ens_lats_clks_areas = { + "zigzag/inputs/examples/workload/alexnet.onnx": (2340181787.2719307, 72692592.0, 3.2026, 0.785592664), + "zigzag/inputs/examples/workload/mobilenetv2.onnx": (703506891.3687075, 28005964.0, 3.2026, 0.785592664), + "zigzag/inputs/examples/workload/resnet18.onnx": (1828766840.9463186, 120700590.0, 3.2026, 0.785592664), + "zigzag.inputs.examples.workload.resnet18": (2008581031.8287854, 130747736.0, 3.2026, 0.785592664), +} + + +@pytest.fixture +def mapping(): + return "zigzag.inputs.examples.mapping.default_imc" + + +@pytest.fixture +def accelerator(): + return "zigzag.inputs.examples.hardware.Dimc" + + +@pytest.mark.parametrize("workload", workloads) +def test_api(workload, accelerator, mapping): + (energy, latency, tclk, area, cmes) = get_hardware_performance_zigzag_imc( + workload, accelerator, mapping + ) + (expected_energy, expected_latency, expected_tclk, 
expected_area) = ens_lats_clks_areas[workload] + assert energy == pytest.approx(expected_energy) + assert latency == pytest.approx(expected_latency) + assert tclk == pytest.approx(expected_tclk) + assert area == pytest.approx(expected_area) diff --git a/zigzag/api.py b/zigzag/api.py index d3856c80..e6274ae8 100644 --- a/zigzag/api.py +++ b/zigzag/api.py @@ -81,6 +81,84 @@ def get_hardware_performance_zigzag( return cmes[0][0].energy_total, cmes[0][0].latency_total2, cmes +def get_hardware_performance_zigzag_imc( + workload, + accelerator, + mapping, + opt="latency", + dump_filename_pattern="outputs/layer_?.json", + pickle_filename="outputs/list_of_cmes.pickle", +): + # Initialize the logger + import logging as _logging + + _logging_level = _logging.INFO + _logging_format = ( + "%(asctime)s - %(funcName)s +%(lineno)s - %(levelname)s - %(message)s" + ) + _logging.basicConfig(level=_logging_level, format=_logging_format) + + # Sanity check on the optimization criterion + if opt == "energy": + opt_stage = MinimalEnergyStage + elif opt == "latency": + opt_stage = MinimalLatencyStage + elif opt == "EDP": + opt_stage = MinimalEDPStage + else: + raise NotImplementedError( + "Optimization criterion 'opt' should be either 'energy' or 'latency' or 'EDP'." 
+ ) + + # Check workload format and based on it select the correct workload parser stage + try: + if workload.split(".")[-1] == "onnx": + workload_parser_stage = ONNXModelParserStage + else: + workload_parser_stage = WorkloadParserStage + except: + workload_parser_stage = WorkloadParserStage + + mainstage = MainStage( + [ # Initialize the MainStage as entry point + workload_parser_stage, # Parse the ONNX Model into the workload + AcceleratorParserStage, # Parse the accelerator module/passthrough given accelerator + SimpleSaveStage, # Save the summed CME energy and latency to a json + PickleSaveStage, # Save all received CMEs in a list to a pickle file + SumStage, # Sum up the received best CME across all layers of the workload + SearchUnusedMemoryStage, # Detect unnecessary memory instances + WorkloadStage, # Iterate through the different layers in the workload + RemoveUnusedMemoryStage, # Remove unnecessary memory instances + CompleteSaveStage, # Save each processed layer to a json + opt_stage, # Reduce all CMEs, returning minimal energy/latency one + SpatialMappingGeneratorStage, # Generate multiple spatial mappings (SM) + opt_stage, # Reduce all CMEs, returning minimal energy/latency one + LomaStage, # Generate multiple temporal mappings (TM) + # TemporalOrderingConversionStage, # Based on the fixed temporal mapping order, generate one temporal mapping (TM) + CostModelStage, # Evaluate generated SM and TM through cost model + ], + accelerator=accelerator, # required by AcceleratorParserStage + workload=workload, # required by workload_parser_stage + mapping=mapping, # required by workload_parser_stage + dump_filename_pattern=dump_filename_pattern, # output file save pattern + pickle_filename=pickle_filename, # filename for pickled list of cmes + loma_lpf_limit=6, # required by LomaStage + enable_mix_spatial_mapping_generation=True, # enable auto-generation of mix spatial mapping + maximize_hardware_utilization=True, # only evaluate spatial mapping with top2 
utilization (fast simulation) + enable_weight_diagonal_mapping=True, # required by SpatialMappingGeneratorStage + loma_show_progress_bar=True, + # If we need access the same input data multiple times from the innermost memory level and the data size is smaller than the memory read bw, + # take into account only one-time access cost (assume the data can stay at the output pins of the memory as long as it is needed). + # By default, if the parameter is not defined, it will be set as False internally. + access_same_data_considered_as_no_access=True, + ) + + # Launch the MainStage + answers = mainstage.run() + # Get CME from answer + cmes = answers + + return cmes[0][0].energy_total, cmes[0][0].latency_total2, cmes[0][0].tclk, cmes[0][0].area_total, cmes def get_hardware_performance_zigzag_pe_array_scaling( workload, diff --git a/zigzag/classes/cost_model/cost_model.py b/zigzag/classes/cost_model/cost_model.py index 10aba134..fee6c76c 100644 --- a/zigzag/classes/cost_model/cost_model.py +++ b/zigzag/classes/cost_model/cost_model.py @@ -219,7 +219,7 @@ def calc_MUW_union(port_duty_list): # * mapping: The combined spatial and temporal mapping object where access patterns are computed. # # The following cost model attributes are also initialized: -# - energy_breakdown: The energy breakdown for all operands +# - mem_energy_breakdown: The energy breakdown for all operands # - energy: The total energy # # After initialization, the cost model evaluation is run. 
@@ -323,8 +323,8 @@ def __jsonrepr__(self): "energy_total": self.energy_total, "operational_energy": self.MAC_energy, "memory_energy": self.mem_energy, - "energy_breakdown_per_level": self.energy_breakdown, - "energy_breakdown_per_level_per_operand": self.energy_breakdown_further, + "memory_energy_breakdown_per_level": self.mem_energy_breakdown, + "memory_energy_breakdown_per_level_per_operand": self.mem_energy_breakdown_further, }, "latency": { "data_onloading": self.latency_total1 - self.latency_total0, @@ -343,8 +343,8 @@ def __jsonrepr__(self): "inputs": { "accelerator": self.accelerator, "layer": self.layer, - "spatial_mapping": self.spatial_mapping - if hasattr(self, "spatial_mapping") + "spatial_mapping": self.spatial_mapping_int + if hasattr(self, "spatial_mapping_int") else None, "temporal_mapping": self.temporal_mapping if hasattr(self, "temporal_mapping") @@ -358,7 +358,6 @@ def __simplejsonrepr__(self): ## Run the cost model evaluation. def run(self): - # - TODO: Latency calculation self.calc_memory_utilization() self.calc_memory_word_access() self.calc_energy() @@ -627,15 +626,15 @@ def calc_MAC_energy_cost(self): ## Computes the memories reading/writing energy by converting the access patterns in self.mapping to # energy breakdown using the memory hierarchy of the core on which the layer is mapped. # - # The energy breakdown is saved in self.energy_breakdown. + # The energy breakdown is saved in self.mem_energy_breakdown. # # The energy total consumption is saved in self.energy_total. 
def calc_memory_energy_cost(self): core = self.accelerator.get_core(self.core_id) mem_hierarchy = core.memory_hierarchy - energy_breakdown = {} - energy_breakdown_further = {} + mem_energy_breakdown = {} + mem_energy_breakdown_further = {} energy_total = 0 for (layer_op, mem_access_list_per_op) in self.memory_word_access.items(): """Retrieve the memory levels in the hierarchy for this memory operand""" @@ -686,10 +685,10 @@ def calc_memory_energy_cost(self): ) ) # here it contains the full split energy_total += total_energy_cost_memory - energy_breakdown[layer_op] = breakdown - energy_breakdown_further[layer_op] = breakdown_further - self.energy_breakdown = energy_breakdown - self.energy_breakdown_further = energy_breakdown_further + mem_energy_breakdown[layer_op] = breakdown + mem_energy_breakdown_further[layer_op] = breakdown_further + self.mem_energy_breakdown = mem_energy_breakdown + self.mem_energy_breakdown_further = mem_energy_breakdown_further self.mem_energy = energy_total self.energy_total = self.mem_energy + self.MAC_energy logger.debug(f"Ran {self}. 
Total energy = {self.energy_total}") @@ -1139,15 +1138,16 @@ def calc_data_loading_offloading_latency(self): self.data_offloading_cycle = data_offloading_cycle ## This function integrates the previous calculated SScomb, data loading and off-loading cycle to get the overall latency - def calc_overall_latency(self): + def calc_overall_latency(self, cycles_per_mac=1): + # @param cycles_per_mac: cycle counts per mac operand (>1 for bit-serial computation) # the ideal cycle count assuming the MAC array is 100% utilized ideal_cycle = ceil( self.layer.total_MAC_count / self.accelerator.get_core(self.core_id).operational_array.total_unit_count - ) + ) * cycles_per_mac # the ideal temporal cycle count given the spatial mapping (the spatial mapping can be non-ideal) - ideal_temporal_cycle = self.mapping_int.temporal_mapping.total_cycle + ideal_temporal_cycle = self.mapping_int.temporal_mapping.total_cycle * cycles_per_mac MAC_spatial_utilization = ideal_cycle / ideal_temporal_cycle # Total latency without the initial data loading and the final data off-loading @@ -1183,46 +1183,46 @@ def __add__(self, other): ## Energy sum.MAC_energy += other.MAC_energy sum.mem_energy += other.mem_energy - for op in sum.energy_breakdown.keys(): - if op in other.energy_breakdown.keys(): + for op in sum.mem_energy_breakdown.keys(): + if op in other.mem_energy_breakdown.keys(): l = [] for i in range( - min(len(self.energy_breakdown[op]), len(other.energy_breakdown[op])) + min(len(self.mem_energy_breakdown[op]), len(other.mem_energy_breakdown[op])) ): l.append( - self.energy_breakdown[op][i] + other.energy_breakdown[op][i] + self.mem_energy_breakdown[op][i] + other.mem_energy_breakdown[op][i] ) - i = min(len(self.energy_breakdown[op]), len(other.energy_breakdown[op])) - l += self.energy_breakdown[op][i:] - l += other.energy_breakdown[op][i:] - sum.energy_breakdown[op] = l + i = min(len(self.mem_energy_breakdown[op]), len(other.mem_energy_breakdown[op])) + l += self.mem_energy_breakdown[op][i:] + 
l += other.mem_energy_breakdown[op][i:] + sum.mem_energy_breakdown[op] = l - for op in sum.energy_breakdown_further.keys(): - if op in other.energy_breakdown_further.keys(): + for op in sum.mem_energy_breakdown_further.keys(): + if op in other.mem_energy_breakdown_further.keys(): l = [] for i in range( min( - len(self.energy_breakdown_further[op]), - len(other.energy_breakdown_further[op]), + len(self.mem_energy_breakdown_further[op]), + len(other.mem_energy_breakdown_further[op]), ) ): l.append( - self.energy_breakdown_further[op][i] - + other.energy_breakdown_further[op][i] + self.mem_energy_breakdown_further[op][i] + + other.mem_energy_breakdown_further[op][i] ) i = min( - len(self.energy_breakdown_further[op]), - len(other.energy_breakdown_further[op]), + len(self.mem_energy_breakdown_further[op]), + len(other.mem_energy_breakdown_further[op]), ) - l += self.energy_breakdown_further[op][i:] - l += other.energy_breakdown_further[op][i:] - sum.energy_breakdown_further[op] = l + l += self.mem_energy_breakdown_further[op][i:] + l += other.mem_energy_breakdown_further[op][i:] + sum.mem_energy_breakdown_further[op] = l - # Get all the operands from other that are not in self and add them to the energy breakdown aswell - op_diff = set(other.energy_breakdown.keys()) - set(self.energy_breakdown.keys()) + # Get all the operands from other that are not in self and add them to the energy breakdown as well + op_diff = set(other.mem_energy_breakdown.keys()) - set(self.mem_energy_breakdown.keys()) for op in op_diff: - sum.energy_breakdown[op] = other.energy_breakdown[op] - sum.energy_breakdown_further[op] = other.energy_breakdown_further[op] + sum.mem_energy_breakdown[op] = other.mem_energy_breakdown[op] + sum.mem_energy_breakdown_further[op] = other.mem_energy_breakdown_further[op] sum.energy_total += other.energy_total @@ -1252,7 +1252,7 @@ def __add__(self, other): sum.data_loading_cycle += other.data_loading_cycle sum.data_offloading_cycle += other.data_offloading_cycle 
sum.ideal_cycle += other.ideal_cycle - sum.ideal_temporal_cycle += other.ideal_temporal_cycle + sum.ideal_temporal_cycle += other.ideal_temporal_cycle # ideal computation cycles without stalling sum.latency_total0 += other.latency_total0 sum.latency_total1 += other.latency_total1 sum.latency_total2 += other.latency_total2 @@ -1295,8 +1295,8 @@ def __add__(self, other): add_attr = [ "MAC_energy", "mem_energy", - "energy_breakdown", - "energy_breakdown_further", + "mem_energy_breakdown", + "mem_energy_breakdown_further", "energy_total", "memory_word_access", "data_loading_cycle", @@ -1335,19 +1335,19 @@ def __mul__(self, number): # Energy mul.MAC_energy *= number mul.mem_energy *= number - mul.energy_breakdown = { + mul.mem_energy_breakdown = { op: [ - mul.energy_breakdown[op][i] * number - for i in range(len(mul.energy_breakdown[op])) + mul.mem_energy_breakdown[op][i] * number + for i in range(len(mul.mem_energy_breakdown[op])) ] - for op in mul.energy_breakdown.keys() + for op in mul.mem_energy_breakdown.keys() } - mul.energy_breakdown_further = { + mul.mem_energy_breakdown_further = { op: [ - mul.energy_breakdown_further[op][i] * number - for i in range(len(mul.energy_breakdown_further[op])) + mul.mem_energy_breakdown_further[op][i] * number + for i in range(len(mul.mem_energy_breakdown_further[op])) ] - for op in mul.energy_breakdown_further.keys() + for op in mul.mem_energy_breakdown_further.keys() } mul.energy_total *= number @@ -1393,8 +1393,8 @@ def __mul__(self, number): mul_attr = [ "MAC_energy", "mem_energy", - "energy_breakdown", - "energy_breakdown_further", + "mem_energy_breakdown", + "mem_energy_breakdown_further", "energy_total", "memory_word_access", "data_loading_cycle", diff --git a/zigzag/classes/cost_model/cost_model_for_sram_imc.py b/zigzag/classes/cost_model/cost_model_for_sram_imc.py new file mode 100644 index 00000000..6e7ece1e --- /dev/null +++ b/zigzag/classes/cost_model/cost_model_for_sram_imc.py @@ -0,0 +1,464 @@ +import logging +from 
zigzag.utils import pickle_deepcopy +from zigzag.classes.cost_model.cost_model import ( + CostModelEvaluation, PortActivity) + +logger = logging.getLogger(__name__) + +## Class that stores inputs and runs them through the zigzag cost model. +# +# Initialize the cost model evaluation with the following inputs: +# - accelerator: the accelerator that includes the core on which to run the layer +# - layer: the layer to run +# - spatial_mapping: the spatial mapping +# - temporal_mapping: the temporal mapping +# +# From these parameters, the following attributes are computed: +# * core: The core on which the layer is ran. This should be specified in the LayerNode attributes. +# * mapping: The combined spatial and temporal mapping object where access patterns are computed. +# +# The following cost model attributes are also initialized: +# - mem_energy_breakdown: The energy breakdown for all operands +# - energy: The total energy +# +# After initialization, the cost model evaluation is run. +class CostModelEvaluationForIMC(CostModelEvaluation): + + ## The class constructor + # After initialization, the cost model evaluation is run + # @param accelerator the accelerator that includes the core on which to run the + # @param layer the layer to run + # @param spatial_mapping the spatial mapping + # @param temporal_mapping the temporal mapping + # @param access_same_data_considered_as_no_access (optional) + def __init__( + self, + *, + accelerator, + layer, + spatial_mapping, + spatial_mapping_int, + temporal_mapping, + access_same_data_considered_as_no_access=True, + ): + super().__init__(accelerator=accelerator, + layer=layer, + spatial_mapping=spatial_mapping, + spatial_mapping_int=spatial_mapping_int, + temporal_mapping=temporal_mapping, + access_same_data_considered_as_no_access=access_same_data_considered_as_no_access) + + def __str__(self): + return super().__str__() + + def __repr__(self): + return super().__repr__() + + # JSON representation used for saving this object 
to a json file. + def __jsonrepr__(self): + # latency_total0 breakdown + computation_breakdown = { + "mac_computation": self.ideal_temporal_cycle, + "weight_loading": self.SS_comb, + } + + return { + "outputs": { + "memory": { + "utilization": self.mem_utili_shared + if hasattr(self, "mem_utili_shared") + else None, + "word_accesses": self.memory_word_access, + }, + "energy": { + "energy_total": self.energy_total, + "operational_energy": self.MAC_energy, + "operational_energy_breakdown": self.MAC_energy_breakdown, + "memory_energy": self.mem_energy, + "memory_energy_breakdown_per_level": self.mem_energy_breakdown, + "memory_energy_breakdown_per_level_per_operand": self.mem_energy_breakdown_further, + }, + "latency": { + "data_onloading": self.latency_total1 - self.latency_total0, + "computation": self.latency_total0, + "data_offloading": self.latency_total2 - self.latency_total1, + "computation_breakdown": computation_breakdown, + }, + "clock": { + "tclk (ns)": self.tclk, + "tclk_breakdown (ns)": self.tclk_breakdown, + }, + "area (mm^2)": { + "total_area": self.area_total, + "total_area_breakdown:": { + "imc_area": self.imc_area, + "mem_area": self.mem_area, + }, + "total_area_breakdown_further": { + "imc_area_breakdown": self.imc_area_breakdown, + "mem_area_breakdown": self.mem_area_breakdown, + }, + }, + "spatial": { + "mac_utilization": { + "ideal": self.MAC_spatial_utilization, + "stalls": self.MAC_utilization0, + "stalls_onloading": self.MAC_utilization1, + "stalls_onloading_offloading": self.MAC_utilization2, + } + }, + }, + "inputs": { + "accelerator": self.accelerator, + "layer": self.layer, + "spatial_mapping": self.spatial_mapping_int + if hasattr(self, "spatial_mapping_int") + else None, + "temporal_mapping": self.temporal_mapping + if hasattr(self, "temporal_mapping") + else None, + }, + } + + ## Simple JSON representation used for saving this object to a simple json file. 
+ def __simplejsonrepr__(self): + return {"energy": self.energy_total, "latency": self.latency_total2, "tclk": self.tclk, "area": self.area_total} + + ## Run the cost model evaluation. + def run(self): + super().calc_memory_utilization() + super().calc_memory_word_access() + self.calc_energy() + self.calc_latency() + self.collect_area_data() + + def collect_area_data(self): + # get imc area + operational_array = self.accelerator.get_core(self.core_id).operational_array + self.imc_area = operational_array.total_area + self.imc_area_breakdown = operational_array.area_breakdown + # get mem area + self.mem_area = 0 + self.mem_area_breakdown = {} + for mem in self.mem_level_list: + memory_instance = mem.memory_instance + memory_instance_name = memory_instance.name + self.mem_area += memory_instance.area + self.mem_area_breakdown[memory_instance_name] = memory_instance.area + # get total area + self.area_total = self.imc_area + self.mem_area + + ## Calculates the energy cost of this cost model evaluation by calculating the memory reading/writing energy. + def calc_energy(self): + # - TODO: Interconnection energy + self.calc_MAC_energy_cost() + super().calc_memory_energy_cost() + + ## Calculate the dynamic MAC energy + def calc_MAC_energy_cost(self): + core = self.accelerator.get_core(self.core_id) + self.MAC_energy_breakdown = core.operational_array.unit.get_energy_for_a_layer(self.layer, self.mapping) + self.MAC_energy = sum([energy for energy in self.MAC_energy_breakdown.values()]) + + ## Calculate latency in 4 steps + # + # 1) As we already calculated the ideal data transfer rate in combined_mapping.py (in the Mapping class), + # here we start with calculating the required (or allowed) memory updating window by comparing the effective + # data size with the physical memory size at each level. 
If the effective data size is smaller than 50% + # of the physical memory size, then we take the whole period as the allowed memory updating window (double buffer effect); + # otherwise we take the period divided by the top_ir_loop as the allowed memory updating window. + # + # 2) Then, we compute the real data transfer rate given the actual memory bw per functional port pair, + # assuming we have enough memory ports. + # + # 3) In reality, there is no infinite memory port to use. So, as the second step, we combine the real + # data transfer attributes per physical memory port. + # + # 4) Finally, we combine the stall/slack of each memory port to get the final latency. + def calc_latency(self): + super().calc_double_buffer_flag() + super().calc_allowed_and_real_data_transfer_cycle_per_DTL() + # Update the latency model to fit IMC requirement + self.combine_data_transfer_rate_per_physical_port() + super().calc_data_loading_offloading_latency() + # find the cycle count per mac + operational_array = self.accelerator.get_core(self.core_id).operational_array + hd_param = operational_array.unit.hd_param + cycles_per_mac = hd_param["input_precision"] / hd_param["input_bit_per_cycle"] + super().calc_overall_latency(cycles_per_mac=cycles_per_mac) + + ## This function calculates the stalling cycles for IMC (In-Memory-Computing) hardware template + # Consider memory sharing and port sharing, combine the data transfer activity + # Step 1: collect port activity per memory instance per physical memory port + # Step 2: calculate SS combine and MUW union parameters per physical memory port + # Note: this calculation is incorrect when following conditions are ALL true: + # (1) there are more than two mem levels for storing weights, e.g. dram -> cache -> IMC cells + # (2) extra stalling is introduced due to the intermediate mem levels (e.g.
due to insufficient bw of cache) + def combine_data_transfer_rate_per_physical_port(self): + # Step 1: collect port activity per memory instance per physical memory port + port_activity_collect = [] + for mem_instance in self.mem_level_list: + port_activity_single = {} + port_list = mem_instance.port_list + for port in port_list: + port_activity_single[str(port)] = [] + for mem_op, mem_lv, mov_dir in port.served_op_lv_dir: + try: + layer_op = self.mem_op_to_layer_op[mem_op] + except: # mem op to layer might not have this mem op (e.g. pooling layer) + continue + period_count = getattr( + self.mapping_int.unit_mem_data_movement[layer_op][ + mem_lv + ].data_trans_period_count, + mov_dir, + ) + if period_count == 0: + # skip the inactive data movement activities because they won't impact SS + continue + period = getattr( + self.mapping_int.unit_mem_data_movement[layer_op][ + mem_lv + ].data_trans_period, + mov_dir, + ) + real_cycle = getattr( + self.real_data_trans_cycle[layer_op][mem_lv], mov_dir + ) + allowed_cycle = getattr( + self.allowed_mem_updat_cycle[layer_op][mem_lv], mov_dir + ) + port_activity = PortActivity( + real_cycle, + allowed_cycle, + period, + period_count, + layer_op, + mem_lv, + mov_dir, + ) + port_activity_single[str(port)].append(port_activity) + port_activity_collect.append(port_activity_single) + self.port_activity_collect = port_activity_collect + + # Step 2: calculate weight loading cycles + layer_const_operand = self.layer.constant_operands[0] # e.g.
"W" + # get spatial mapping in a macro + core = next(iter(self.accelerator.cores)) + operational_array = core.operational_array + memory_hierarchy = core.mem_hierarchy_dict + hd_param = operational_array.unit.hd_param + wl_dim = hd_param["wordline_dimension"] + bl_dim = hd_param["bitline_dimension"] + spatial_mapping_in_macro = [] + for layer_dim, loop in self.layer.user_spatial_mapping.items(): + if layer_dim in [wl_dim, bl_dim]: # serve the dimension inside the macro + if isinstance(loop[0], str): # single layer_dim unrolling + spatial_mapping_in_macro.append(loop) + else: # mix layer_dim unrolling + for element in loop: + spatial_mapping_in_macro.append(element) + # check if there is only one mem level for weight in accelerator. No weight loading required if that is the case. + weight_mem_op = self.layer_op_to_mem_op[layer_const_operand] + weight_mem_hierarchy: list = memory_hierarchy[weight_mem_op] + if len(weight_mem_hierarchy) == 1: # there is only one mem level for weight + require_weight_loading = False + else: + require_weight_loading = True + # check how many times of weight reloading is required + # here assume imc cells is the lowest mem level for weight and rw_port + for imc_port, imc_ports in port_activity_collect[0].items(): # 0: the lowest mem node in the graph + for port in imc_ports: + if port.served_op_lv_dir[2] == "wr_in_by_high": + nb_of_weight_reload_periods = port.period_count + + # get the number of mapped rows in a macro + imc_macro = operational_array.unit + mapped_rows_total = imc_macro.mapped_rows_total + + # get the number of weights stored in each cell group + mapped_group_depth = imc_macro.mapped_group_depth + + # calculate the total number of weight loading cycles + if require_weight_loading: + weight_loading_cycles = nb_of_weight_reload_periods * mapped_rows_total * mapped_group_depth + else: + weight_loading_cycles = 0 + + self.SS_comb = weight_loading_cycles + + # Step 3: fetch tclk information + self.tclk = operational_array.tclk 
def __add__(self, other):
    """Return a new evaluation that accumulates the costs of `self` and `other`.

    `self` is deep-copied and the copy is returned; neither operand is replaced.
    Energy, memory-access and latency figures are summed, the MAC utilizations
    are recomputed from the summed cycle counts, and `layer` / `core_id` become
    flat lists that record every contribution. `tclk`, `tclk_breakdown` and the
    area attributes are kept as copied from `self` (they are not summed).
    Every other public attribute is removed from the result because it has no
    meaning for a sum of evaluations.

    Args:
        other: the evaluation to add; it must expose the same cost attributes.

    Returns:
        The accumulated evaluation (a pruned deep copy of `self`).
    """

    def _merge_levels(own, others):
        # Add two per-memory-level sequences element by element; when one
        # operand has more levels than the other, its extra levels are
        # appended unchanged.
        shared = min(len(own), len(others))
        merged = [own[i] + others[i] for i in range(shared)]
        merged += own[shared:]
        merged += others[shared:]
        return merged

    # `result` instead of `sum`: do not shadow the builtin.
    result = pickle_deepcopy(self)

    ## Energy
    result.MAC_energy += other.MAC_energy
    result.mem_energy += other.mem_energy
    for key in result.MAC_energy_breakdown:
        if key in other.MAC_energy_breakdown:
            result.MAC_energy_breakdown[key] = (
                self.MAC_energy_breakdown[key] + other.MAC_energy_breakdown[key]
            )

    for op in result.mem_energy_breakdown:
        if op in other.mem_energy_breakdown:
            result.mem_energy_breakdown[op] = _merge_levels(
                self.mem_energy_breakdown[op], other.mem_energy_breakdown[op]
            )

    for op in result.mem_energy_breakdown_further:
        if op in other.mem_energy_breakdown_further:
            result.mem_energy_breakdown_further[op] = _merge_levels(
                self.mem_energy_breakdown_further[op],
                other.mem_energy_breakdown_further[op],
            )

    # Operands that only exist in `other` are taken over as they are.
    mem_ops_only_in_other = set(other.mem_energy_breakdown) - set(self.mem_energy_breakdown)
    for op in mem_ops_only_in_other:
        result.mem_energy_breakdown[op] = other.mem_energy_breakdown[op]
        result.mem_energy_breakdown_further[op] = other.mem_energy_breakdown_further[op]

    mac_keys_only_in_other = set(other.MAC_energy_breakdown) - set(self.MAC_energy_breakdown)
    for key in mac_keys_only_in_other:
        result.MAC_energy_breakdown[key] = other.MAC_energy_breakdown[key]

    result.energy_total += other.energy_total

    ## Memory access
    for op in result.memory_word_access:
        if op in other.memory_word_access:
            result.memory_word_access[op] = _merge_levels(
                self.memory_word_access[op], other.memory_word_access[op]
            )
    # The missing operands must be derived from memory_word_access itself.
    # Reusing the MAC-energy key difference here would index
    # memory_word_access with MAC component names.
    access_ops_only_in_other = set(other.memory_word_access) - set(self.memory_word_access)
    for op in access_ops_only_in_other:
        result.memory_word_access[op] = other.memory_word_access[op]

    ## Latency
    result.data_loading_cycle += other.data_loading_cycle
    result.data_offloading_cycle += other.data_offloading_cycle
    result.ideal_cycle += other.ideal_cycle
    result.SS_comb += other.SS_comb  # stalling cycles
    result.ideal_temporal_cycle += other.ideal_temporal_cycle  # ideal computation cycles without stalling
    result.latency_total0 += other.latency_total0
    result.latency_total1 += other.latency_total1
    result.latency_total2 += other.latency_total2

    ## MAC utilization (recomputed from the accumulated cycle counts)
    result.MAC_spatial_utilization = result.ideal_cycle / result.ideal_temporal_cycle
    result.MAC_utilization0 = result.ideal_cycle / result.latency_total0
    result.MAC_utilization1 = result.ideal_cycle / result.latency_total1
    result.MAC_utilization2 = result.ideal_cycle / result.latency_total2

    ## layer
    # An operand that is itself the result of an addition already carries a
    # list of layer ids; a fresh evaluation carries a single layer object.
    if not isinstance(result.layer, list):
        result.layer = [result.layer.id]
    other_layer = other.layer if isinstance(other.layer, list) else [other.layer.id]
    result.layer += other_layer

    ## core_id
    if not isinstance(result.core_id, list):
        result.core_id = [result.core_id]
    other_core_id = other.core_id if isinstance(other.core_id, list) else [other.core_id]
    result.core_id += other_core_id

    ## Not addable
    func = [
        "calc_allowed_and_real_data_transfer_cycle_per_DTL",
        "calc_data_loading_offloading_latency",
        "calc_double_buffer_flag",
        "calc_overall_latency",
        "calc_MAC_energy_cost",
        "calc_energy",
        "calc_latency",
        "calc_memory_energy_cost",
        "calc_memory_utilization",
        "calc_memory_word_access",
        "combine_data_transfer_rate_per_physical_port",
        "collect_area_data",
        "run",
    ]
    add_attr = [
        "MAC_energy",
        "mem_energy",
        "MAC_energy_breakdown",
        "mem_energy_breakdown",
        "mem_energy_breakdown_further",
        "energy_total",
        "memory_word_access",
        "data_loading_cycle",
        "data_offloading_cycle",
        "ideal_cycle",
        "ideal_temporal_cycle",
        "SS_comb",
        "latency_total0",
        "latency_total1",
        "latency_total2",
        "tclk",
        "tclk_breakdown",
        "MAC_spatial_utilization",
        "MAC_utilization0",
        "MAC_utilization1",
        "MAC_utilization2",
        "area_total",
        "imc_area",
        "mem_area",
        "imc_area_breakdown",
        "mem_area_breakdown",
        "layer",
        "core_id",
    ]

    # The accelerator is only kept when one accelerator name is a prefix of
    # the other; the result then carries the one with the shorter name.
    if hasattr(self, "accelerator") and hasattr(other, "accelerator"):
        if self.accelerator.name.startswith(other.accelerator.name):
            result.accelerator = other.accelerator
            add_attr.append("accelerator")
        elif other.accelerator.name.startswith(self.accelerator.name):
            add_attr.append("accelerator")

    # Drop every public attribute that is neither a listed method nor addable.
    kept_names = set(func + add_attr)
    for attr in dir(result):
        if attr not in kept_names and attr[0] != "_":
            delattr(result, attr)

    return result
###############################################################################################################
# README
#   . class AimcArray (defines the energy/area/delay cost of an ADC, a DAC and an AIMC array)
# How to use this file?
#   . This file is internally called in ZigZag-IMC framework.
#   . It can also be run independently, for mainly debugging. An example is given at the end of the file.
###############################################################################################################

class AimcArray(ImcUnit):
    """Area / delay / energy cost model of an Analog In-SRAM-Computing (AIMC) core.

    Constraints stated by the original authors:
      -- activation precision must be a power of 2.
      -- input_bit_per_cycle must be a power of 2.
    `get_area` additionally asserts that weight_precision is a power of 2.
    """

    def __init__(self, tech_param: dict, hd_param: dict, dimensions: dict):
        """Forward all parameters to ImcUnit.

        @param tech_param: technology related parameters
        @param hd_param: IMC cores' parameters
        @param dimensions: IMC cores' dimensions
        """
        super().__init__(tech_param, hd_param, dimensions)

    def __jsonrepr__(self):
        """
        JSON Representation of this class to save it to a json file.
        """
        # not implemented (returns None)
        # return {"operational_unit": self.unit, "dimensions": self.dimensions}
        pass

    def get_adc_cost(self):
        """Return the (area, delay, energy) of a single ADC.

        Units per the inline notes: area in mm^2, delay in ns, energy in pJ.
        """
        adc_resolution = self.hd_param["adc_resolution"]

        # area (mm^2)
        if adc_resolution == 1:
            adc_area = 0
        else:  # formula extracted and validated against 3 AIMC papers on 28nm
            k1 = -0.0369
            k2 = 1.206
            adc_area = 10 ** (k1 * adc_resolution + k2) * 2**adc_resolution * (10**-6)  # unit: mm^2

        # delay (ns)
        # NOTE(review): reads dimensions["D2"] directly rather than the configured
        # bitline dimension -- confirm this is intended when bitline_dimension != "D2".
        k3 = 0.00653  # ns
        k4 = 0.640  # ns
        adc_delay = adc_resolution * (k3 * self.dimensions["D2"] + k4)  # unit: ns

        # energy (computed in fJ, returned in pJ)
        k5 = 100  # fF
        k6 = 0.001  # fF
        adc_energy = (k5 * adc_resolution + k6 * 4**adc_resolution) * self.logic_unit.tech_param["vdd"] ** 2  # unit: fJ
        adc_energy = adc_energy / 1000  # unit: pJ
        return adc_area, adc_delay, adc_energy

    def get_dac_cost(self):
        """Return the (area, delay, energy) of a single DAC.

        Area and delay are neglected (0). Energy is in pJ and is 0 when only one
        input bit is applied per cycle.
        """
        dac_area = 0  # neglected
        dac_delay = 0  # neglected
        if self.hd_param["input_bit_per_cycle"] == 1:
            dac_energy = 0
        else:
            k0 = 50e-3  # pF
            dac_energy = k0 * self.hd_param["input_bit_per_cycle"] * self.logic_unit.tech_param["vdd"] ** 2  # unit: pJ
        return dac_area, dac_delay, dac_energy

    def get_area(self):
        """Compute the AIMC macro area and store it in `self.area_breakdown` / `self.area`.

        Components: cells, mults, adcs, dacs, adders_pv, accumulators
        (input/output registers are not included).

        Raises:
            Exception: if hd_param["enable_cacti"] is not True (user-provided
                cell area is not supported yet).
            AssertionError: if weight_precision is not a power of 2.
        """
        # area of cell array
        tech_node = self.logic_unit.tech_param["tech_node"]
        group_depth = self.hd_param["group_depth"]
        w_pres = self.hd_param["weight_precision"]
        if self.hd_param["enable_cacti"] == True:
            single_cell_array_area = self.get_single_cell_array_cost_from_cacti(
                tech_node, self.wl_dim_size, self.bl_dim_size, group_depth, w_pres
            )[1]
            # area of a single cell array multiplied with the number of banks
            area_cells = single_cell_array_area * self.nb_of_banks  # total cell array area in the core
        else:
            # TODO: [TO BE SUPPORTED OR YOU CAN MODIFY YOURSELF]
            raise Exception("User-provided cell area is not supported yet.")

        # area of multiplier array
        area_mults = self.logic_unit.get_1b_multiplier_area() * w_pres * \
                     self.wl_dim_size * self.bl_dim_size * self.nb_of_banks

        # area of ADCs
        area_adcs = self.get_adc_cost()[0] * w_pres * self.wl_dim_size * self.nb_of_banks

        # area of DACs
        area_dacs = self.get_dac_cost()[0] * self.bl_dim_size * self.nb_of_banks

        # area of adders with place values after ADC conversion (type: RCA)
        nb_inputs_of_adder_pv = w_pres
        if nb_inputs_of_adder_pv == 1:
            nb_of_1b_adder_pv = 0
        else:
            adder_depth_pv = math.log2(nb_inputs_of_adder_pv)
            assert adder_depth_pv % 1 == 0, \
                f"[AimcArray] The value [{nb_inputs_of_adder_pv}] of [weight_precision] is not in the power of 2."
            adder_depth_pv = int(adder_depth_pv)  # float -> int for simplicity
            adder_input_precision = self.hd_param["adc_resolution"]
            # nb of 1b adders in a single place-value adder tree
            nb_of_1b_adder_pv = adder_input_precision * (nb_inputs_of_adder_pv - 1) + \
                                nb_inputs_of_adder_pv * (adder_depth_pv - 0.5)
            nb_of_1b_adder_pv *= self.wl_dim_size * self.nb_of_banks  # multiply with nb_of_adder_trees
        area_adders_pv = self.logic_unit.get_1b_adder_area() * nb_of_1b_adder_pv

        # area of accumulators (adder type: RCA)
        if self.hd_param["input_bit_per_cycle"] == self.hd_param["input_precision"]:
            area_accumulators = 0
        else:
            # output precision from adders_pv + required shifted bits
            accumulator_output_pres = w_pres + self.hd_param["adc_resolution"] + self.hd_param["input_precision"]
            nb_of_1b_adder_accumulator = accumulator_output_pres * self.wl_dim_size * self.nb_of_banks
            nb_of_1b_reg_accumulator = nb_of_1b_adder_accumulator  # number of regs in an accumulator
            area_accumulators = self.logic_unit.get_1b_adder_area() * nb_of_1b_adder_accumulator + \
                                self.logic_unit.get_1b_reg_area() * nb_of_1b_reg_accumulator

        # total area of imc
        self.area_breakdown = {  # unit: same with in input hd file
            "cells": area_cells,
            "mults": area_mults,
            "adcs": area_adcs,
            "dacs": area_dacs,
            "adders_pv": area_adders_pv,
            "accumulators": area_accumulators,
        }
        self.area = sum(self.area_breakdown.values())

    def get_delay(self):
        """Compute the macro delay and store it in `self.delay_breakdown` / `self.delay`.

        Worst path: dacs -> mults -> adcs -> adders_pv -> accumulators.
        """
        # delay of dacs
        dly_dacs = self.get_dac_cost()[1]

        # delay of multipliers
        dly_mults = self.logic_unit.get_1b_multiplier_dly()

        # delay of adcs
        dly_adcs = self.get_adc_cost()[1]

        # delay of adders_pv (adder type: RCA, worst path:
        # in-to-sum -> in-to-sum -> ... -> in-to-cout -> cin-to-cout -> ... -> cin-to-cout)
        w_pres = self.hd_param["weight_precision"]  # weight precision
        nb_inputs_of_adder_pv = w_pres
        if nb_inputs_of_adder_pv == 1:
            dly_adders_pv = 0
            # No place-value adder: the accumulator is fed directly by the ADC.
            # Without this assignment the accumulator branch below would hit an
            # unbound local for 1-bit weights. Same convention as
            # DimcArray.get_delay, which falls back to the previous stage's
            # output precision.
            accumulator_input_pres = self.hd_param["adc_resolution"]
        else:
            adder_depth_pv = math.log2(nb_inputs_of_adder_pv)
            adder_depth_pv = int(adder_depth_pv)  # float -> int for simplicity
            # output precision from adders_pv
            adder_pv_output_precision = nb_inputs_of_adder_pv + self.hd_param["adc_resolution"]
            accumulator_input_pres = adder_pv_output_precision
            dly_adders_pv = (adder_depth_pv - 1) * self.logic_unit.get_1b_adder_dly_in2sum() + \
                            self.logic_unit.get_1b_adder_dly_in2cout() + \
                            (adder_pv_output_precision - 1) * self.logic_unit.get_1b_adder_dly_cin2cout()

        # delay of accumulators (adder type: RCA)
        if self.hd_param["input_bit_per_cycle"] == self.hd_param["input_precision"]:
            dly_accumulators = 0
        else:
            # output precision from adders_pv + required shifted bits
            accumulator_output_pres = self.hd_param["weight_precision"] + self.hd_param["adc_resolution"] + \
                                      self.hd_param["input_precision"]
            dly_accumulators = self.logic_unit.get_1b_adder_dly_in2cout() + \
                               (accumulator_output_pres - accumulator_input_pres) * \
                               self.logic_unit.get_1b_adder_dly_cin2cout()

        # total delay of imc
        self.delay_breakdown = {
            "dacs": dly_dacs,
            "mults": dly_mults,
            "adcs": dly_adcs,
            "adders_pv": dly_adders_pv,
            "accumulators": dly_accumulators,
        }
        self.delay = sum(self.delay_breakdown.values())

    def get_peak_energy_single_cycle(self):
        """Return the macro-level one-cycle energy breakdown (full utilization, no weight updating).

        Components: precharging, dacs, adcs, analog_bitlines, mults, adders_pv,
        accumulators (input/output registers are not included). Unit: pJ.
        """
        layer_const_operand_pres = self.hd_param["weight_precision"]
        layer_act_operand_pres = self.hd_param["input_precision"]

        # energy of precharging
        energy_precharging = 0

        # energy of DACs
        energy_dacs = self.get_dac_cost()[2] * self.bl_dim_size * self.nb_of_banks

        # energy of cell array (bitline accumulation, type: voltage-based)
        energy_cells = (self.logic_unit.tech_param["bl_cap"] * (self.logic_unit.tech_param["vdd"] ** 2) *
                        layer_const_operand_pres) * \
                       self.wl_dim_size * self.bl_dim_size * self.nb_of_banks

        # energy of ADCs
        energy_adcs = self.get_adc_cost()[2] * layer_const_operand_pres * self.wl_dim_size * self.nb_of_banks

        # energy of multiplier array
        energy_mults = (self.logic_unit.get_1b_multiplier_energy() * layer_const_operand_pres) * \
                       self.bl_dim_size * self.wl_dim_size * self.nb_of_banks

        # energy of adders_pv (type: RCA)
        nb_inputs_of_adder_pv = layer_const_operand_pres
        if nb_inputs_of_adder_pv == 1:
            energy_adders_pv = 0
        else:
            adder_pv_input_precision = self.hd_param["adc_resolution"]
            nb_of_1b_adder_pv = adder_pv_input_precision * (nb_inputs_of_adder_pv - 1) + \
                                nb_inputs_of_adder_pv * (math.log2(nb_inputs_of_adder_pv) - 0.5)
            energy_adders_pv = nb_of_1b_adder_pv * self.logic_unit.get_1b_adder_energy() * \
                               self.wl_dim_size * self.nb_of_banks

        # energy of accumulators (adder type: RCA)
        if self.hd_param["input_bit_per_cycle"] == layer_act_operand_pres:
            energy_accumulators = 0
        else:
            accumulator_output_pres = layer_act_operand_pres + layer_const_operand_pres + \
                                      math.log2(self.bl_dim_size)
            energy_accumulators = (self.logic_unit.get_1b_adder_energy() + self.logic_unit.get_1b_reg_energy()) * \
                                  accumulator_output_pres * \
                                  self.wl_dim_size * self.nb_of_banks

        peak_energy_breakdown = {  # unit: pJ (the unit borrowed from CACTI)
            "precharging": energy_precharging,
            "dacs": energy_dacs,
            "adcs": energy_adcs,
            "analog_bitlines": energy_cells,
            "mults": energy_mults,
            "adders_pv": energy_adders_pv,
            "accumulators": energy_accumulators,
        }
        return peak_energy_breakdown

    def get_macro_level_peak_performance(self):
        """Log and return the macro-level peak (TOP/s, TOP/s/W, TOP/s/mm^2).

        Assumes full utilization and no weight updating. Refreshes `self.area`
        and `self.delay` as a side effect.
        """
        nb_of_macs_per_cycle = self.wl_dim_size * self.bl_dim_size / \
                               (self.hd_param["input_precision"] / self.hd_param["input_bit_per_cycle"]) * \
                               self.nb_of_banks

        self.get_area()
        self.get_delay()

        clock_cycle_period = self.delay  # unit: ns
        peak_energy_per_cycle = sum(self.get_peak_energy_single_cycle().values())  # unit: pJ
        imc_area = self.area  # unit: mm^2

        # each MAC counts as 2 operations
        tops_peak = nb_of_macs_per_cycle * 2 / clock_cycle_period / 1000
        topsw_peak = nb_of_macs_per_cycle * 2 / peak_energy_per_cycle
        topsmm2_peak = tops_peak / imc_area

        logger = _logging.getLogger(__name__)
        logger.info(f"Current macro-level peak performance:")
        logger.info(f"TOP/s: {tops_peak}, TOP/s/W: {topsw_peak}, TOP/s/mm^2: {topsmm2_peak}")

        return tops_peak, topsw_peak, topsmm2_peak

    def get_energy_for_a_layer(self, layer, mapping):
        """Compute the macro energy for `layer` under `mapping`.

        Stores `self.mapped_rows_total`, `self.mapped_group_depth`,
        `self.energy_breakdown` and `self.energy`, and returns the breakdown
        (unit: pJ).

        Raises:
            AssertionError: if the layer's weight or activation precision differs
                from the one configured in hd_param.
        """
        # activation/weight representation
        layer_act_operand, layer_const_operand = DimcArray.identify_layer_operand_representation(layer)

        layer_const_operand_pres = layer.operand_precision[layer_const_operand]
        layer_act_operand_pres = layer.operand_precision[layer_act_operand]
        weight_pres_in_hd_param = self.hd_param["weight_precision"]
        act_pres_in_hd_param = self.hd_param["input_precision"]

        # currently in the energy model, the input and weight precision defined in the workload file
        # should be the same with in the hd input file.
        # this check can be removed if variable precision is supported in the future.
        assert layer_const_operand_pres == weight_pres_in_hd_param, \
            f"Weight precision defined in the workload [{layer_const_operand_pres}] not equal to the one defined in the hardware hd_param [{weight_pres_in_hd_param}]."
        assert layer_act_operand_pres == act_pres_in_hd_param, \
            f"Activation precision defined in the workload [{layer_act_operand_pres}] not equal to the one defined in the hardware hd_param [{act_pres_in_hd_param}]."

        # parameter extraction
        mapped_rows_total, _mapped_rows_for_adder, mapped_cols, macro_activation_times = \
            DimcArray.get_mapped_oa_dim(layer, self.wl_dim, self.bl_dim)
        self.mapped_rows_total = mapped_rows_total

        # energy of precharging
        energy_precharging, mapped_group_depth = DimcArray.get_precharge_energy(
            self.hd_param, self.logic_unit.tech_param, layer, mapping)
        self.mapped_group_depth = mapped_group_depth

        # energy of DACs
        energy_dacs = self.get_dac_cost()[2] * mapped_rows_total * \
                      layer_act_operand_pres / self.hd_param["input_bit_per_cycle"] * macro_activation_times

        # energy of cell array (bitline accumulation, type: voltage-based)
        energy_cells = (self.logic_unit.tech_param["bl_cap"] * (self.logic_unit.tech_param["vdd"] ** 2) *
                        layer_const_operand_pres) * \
                       mapped_cols * \
                       self.bl_dim_size * \
                       layer_act_operand_pres / self.hd_param["input_bit_per_cycle"] * \
                       macro_activation_times

        # energy of ADCs
        energy_adcs = self.get_adc_cost()[2] * layer_const_operand_pres * mapped_cols * \
                      layer_act_operand_pres / self.hd_param["input_bit_per_cycle"] * macro_activation_times

        # energy of multiplier array
        energy_mults = (self.logic_unit.get_1b_multiplier_energy() * layer_const_operand_pres) * \
                       (mapped_rows_total * self.wl_dim_size) * \
                       (layer_act_operand_pres / self.hd_param["input_bit_per_cycle"]) * \
                       macro_activation_times

        # energy of adders_pv (type: RCA)
        nb_inputs_of_adder_pv = layer_const_operand_pres
        if nb_inputs_of_adder_pv == 1:
            energy_adders_pv = 0
        else:
            adder_pv_input_precision = self.hd_param["adc_resolution"]
            nb_of_1b_adder_pv = adder_pv_input_precision * (nb_inputs_of_adder_pv - 1) + \
                                nb_inputs_of_adder_pv * (math.log2(nb_inputs_of_adder_pv) - 0.5)
            energy_adders_pv = nb_of_1b_adder_pv * self.logic_unit.get_1b_adder_energy() * mapped_cols * \
                               layer_act_operand_pres / self.hd_param["input_bit_per_cycle"] * macro_activation_times

        # energy of accumulators (adder type: RCA)
        if self.hd_param["input_bit_per_cycle"] == layer_act_operand_pres:
            energy_accumulators = 0
        else:
            accumulator_output_pres = layer_act_operand_pres + layer_const_operand_pres + \
                                      math.log2(self.bl_dim_size)
            energy_accumulators = (self.logic_unit.get_1b_adder_energy() + self.logic_unit.get_1b_reg_energy()) * \
                                  accumulator_output_pres * \
                                  mapped_cols * \
                                  layer_act_operand_pres / self.hd_param["input_bit_per_cycle"] * \
                                  macro_activation_times

        self.energy_breakdown = {  # unit: pJ (the unit borrowed from CACTI)
            "precharging": energy_precharging,
            "dacs": energy_dacs,
            "adcs": energy_adcs,
            "analog_bitlines": energy_cells,
            "mults": energy_mults,
            "adders_pv": energy_adders_pv,
            "accumulators": energy_accumulators,
        }
        self.energy = sum(self.energy_breakdown.values())
        return self.energy_breakdown
if __name__ == "__main__":
    # Stand-alone debugging example for the AIMC cost model.
    #
    # IMC hardware dimensions (keypoint: the adders' accumulation happens on D2):
    #   D1: wordline dimension (nb_of_columns/macro = D1 * weight_precision)
    #   D2: bitline dimension  (nb_of_rows/macro    = D2 * group_depth)
    #   D3: nb_of_macros
    # Datapath inside one macro, from the cell groups (each `group_depth` weights deep)
    # down to the outputs:
    #   multipliers -> adders (DIMC) OR adcs (AIMC) -> adders_pv (place value)
    #   -> accumulators -> outputs
    tech_param_28nm = dict(
        tech_node=0.028,             # unit: um
        vdd=0.9,                     # unit: V
        nd2_cap=0.7/1e3,             # unit: pF
        xor2_cap=0.7*1.5/1e3,        # unit: pF
        dff_cap=0.7*3/1e3,           # unit: pF
        nd2_area=0.614/1e6,          # unit: mm^2
        xor2_area=0.614*2.4/1e6,     # unit: mm^2
        dff_area=0.614*6/1e6,        # unit: mm^2
        nd2_dly=0.0478,              # unit: ns
        xor2_dly=0.0478*2.4,         # unit: ns
        # dff_dly=0.0478*3.4,        # unit: ns
    )
    dimensions = dict(
        D1=32/8,  # wordline dimension
        D2=32,    # bitline dimension
        D3=1,     # nb_macros
    )  # {"D1": ("K", 4), "D2": ("C", 32),}

    # hd_param example for AIMC
    input_bit_per_cycle = 2
    hd_param_aimc = dict(
        pe_type="in_sram_computing",              # required for CostModelStage
        imc_type="analog",                        # "digital" or "analog". Or else: pure digital
        input_precision=8,                        # activation precision
        weight_precision=8,                       # weight precision
        input_bit_per_cycle=input_bit_per_cycle,  # nb_bits of input/cycle
        group_depth=1,                            # m factor
        # adc resolution, derived from the input bits per cycle and the bitline dimension
        adc_resolution=input_bit_per_cycle + 0.5*math.log2(dimensions["D2"]),
        wordline_dimension="D1",                  # wordline dimension
        bitline_dimension="D2",                   # bitline dimension
        enable_cacti=True,                        # use CACTI to estimated cell array area cost (cell array exclude build-in logic part)
    )

    aimc = AimcArray(tech_param_28nm, hd_param_aimc, dimensions)
    aimc.get_area()
    aimc.get_delay()

    logger = _logging.getLogger(__name__)
    report = (
        ("Total IMC area (mm^2)", aimc.area),
        ("area breakdown", aimc.area_breakdown),
        ("delay (ns)", aimc.delay),
        ("delay breakdown (ns)", aimc.delay_breakdown),
    )
    for label, value in report:
        logger.info(f"{label}: {value}")

    aimc.get_macro_level_peak_performance()
    exit()
def get_area(self):
    """Compute the DIMC macro area and store it in `self.area_breakdown` / `self.area`.

    Components: cells, mults, adders, adders_pv, accumulators
    (input/output registers are not included).

    Raises:
        Exception: if hd_param["enable_cacti"] is not True.
        AssertionError: if bl_dim_size or input_bit_per_cycle is not a power of 2.
    """
    hd = self.hd_param
    logic = self.logic_unit

    # --- cell array ---
    tech_node = logic.tech_param["tech_node"]
    cells_per_group = hd["group_depth"]
    weight_bits = hd["weight_precision"]
    if not (hd["enable_cacti"] == True):
        # TODO: [TO BE SUPPORTED OR YOU CAN MODIFY YOURSELF]
        raise Exception("User-provided cell area is not supported yet.")
    cell_array_cost = self.get_single_cell_array_cost_from_cacti(
        tech_node, self.wl_dim_size, self.bl_dim_size, cells_per_group, weight_bits
    )
    # entry [1] is the area of one cell array; scale it by the number of banks
    cells_area = cell_array_cost[1] * self.nb_of_banks

    # --- multiplier array ---
    mults_area = logic.get_1b_multiplier_area() * hd["input_bit_per_cycle"] * \
        weight_bits * self.wl_dim_size * self.bl_dim_size * self.nb_of_banks

    # --- adder trees (type: RCA) ---
    tree_inputs = self.bl_dim_size
    tree_depth = math.log2(tree_inputs)
    assert tree_depth % 1 == 0, \
        f"[DimcArray] The number of inputs [{tree_inputs}] for the adder tree is not in the power of 2."
    tree_depth = int(tree_depth)
    tree_output_bits = weight_bits + tree_depth
    # number of 1b adders in one adder tree
    adders_per_tree = tree_inputs * (weight_bits + 1) - (weight_bits + tree_depth + 1)
    tree_count = hd["input_bit_per_cycle"] * self.wl_dim_size * self.nb_of_banks
    adders_area = logic.get_1b_adder_area() * adders_per_tree * tree_count

    # --- extra place-value (pv) adders, only needed when input_bit_per_cycle > 1 (type: RCA) ---
    pv_inputs = hd["input_bit_per_cycle"]
    adders_per_pv_tree = 0
    pv_tree_count = 0
    if pv_inputs != 1:
        pv_depth = math.log2(pv_inputs)
        assert pv_depth % 1 == 0, \
            f"[DimcArray] The value [{pv_inputs}] of [input_bit_per_cycle] is not in the power of 2."
        pv_depth = int(pv_depth)
        # the pv adders take the adder-tree outputs as inputs
        adders_per_pv_tree = tree_output_bits * (pv_inputs - 1) + pv_inputs * (pv_depth - 0.5)
        pv_tree_count = self.wl_dim_size * self.nb_of_banks
    adders_pv_area = logic.get_1b_adder_area() * adders_per_pv_tree * pv_tree_count

    # --- accumulators (adder type: RCA), absent when all input bits arrive in one cycle ---
    if hd["input_bit_per_cycle"] == hd["input_precision"]:
        accumulators_area = 0
    else:
        accumulator_bits = hd["input_precision"] + hd["weight_precision"] + math.log2(self.bl_dim_size)
        # one 1b adder and one 1b register per accumulator bit
        accumulator_cells = accumulator_bits * self.wl_dim_size * self.nb_of_banks
        accumulators_area = logic.get_1b_adder_area() * accumulator_cells + \
            logic.get_1b_reg_area() * accumulator_cells

    # --- total ---
    self.area_breakdown = {  # unit: same with in input hd file
        "cells": cells_area,
        "mults": mults_area,
        "adders": adders_area,
        "adders_pv": adders_pv_area,
        "accumulators": accumulators_area,
    }
    self.area = sum(self.area_breakdown.values())
def get_delay(self):
    """Compute the DIMC macro delay and store it in `self.delay_breakdown` / `self.delay`.

    Worst path: mults -> adders -> adders_pv -> accumulators.
    Unit: ns (if CACTI is used); otherwise whatever unit the inputs use.

    Raises:
        AssertionError: if bl_dim_size or input_bit_per_cycle is not a power of 2.
    """
    hd = self.hd_param
    logic = self.logic_unit

    # --- multipliers ---
    mults_delay = logic.get_1b_multiplier_dly()

    # --- adder tree (type: RCA) ---
    weight_bits = hd["weight_precision"]
    tree_inputs = self.bl_dim_size
    tree_depth = math.log2(tree_inputs)
    assert tree_depth % 1 == 0, \
        f"[DimcArray] The number of inputs [{tree_inputs}] for the adder tree is not in the power of 2."
    tree_depth = int(tree_depth)
    tree_output_bits = weight_bits + tree_depth
    adders_delay = (tree_depth - 1) * logic.get_1b_adder_dly_in2sum() + \
        logic.get_1b_adder_dly_in2cout() + \
        (tree_output_bits - 1 - 1) * logic.get_1b_adder_dly_cin2cout()

    # --- place-value (pv) adders (type: RCA) ---
    pv_inputs = hd["input_bit_per_cycle"]
    # without pv adders the accumulator is fed directly by the adder tree
    adders_pv_delay = 0
    accumulator_input_bits = tree_output_bits
    if pv_inputs != 1:
        pv_depth = math.log2(pv_inputs)
        assert pv_depth % 1 == 0, \
            f"[DimcArray] The value [{pv_inputs}] of [input_bit_per_cycle] is not in the power of 2."
        pv_depth = int(pv_depth)
        pv_input_bits = tree_output_bits
        pv_output_bits = pv_inputs + tree_output_bits  # output precision from adders_pv
        accumulator_input_bits = pv_output_bits
        adders_pv_delay = (pv_depth - 1) * logic.get_1b_adder_dly_in2sum() + \
            logic.get_1b_adder_dly_in2cout() + \
            (pv_output_bits - pv_input_bits - 1) * logic.get_1b_adder_dly_cin2cout()

    # --- accumulators (adder type: RCA) ---
    accumulator_output_bits = int(
        hd["input_precision"] + hd["weight_precision"] + math.log2(self.bl_dim_size)
    )  # float -> int for simplicity
    if accumulator_output_bits == accumulator_input_bits:  # no accumulator
        accumulators_delay = 0
    else:
        accumulators_delay = logic.get_1b_adder_dly_in2cout() + \
            (accumulator_output_bits - accumulator_input_bits) * logic.get_1b_adder_dly_cin2cout()

    # --- total ---
    self.delay_breakdown = {
        "mults": mults_delay,
        "adders": adders_delay,
        "adders_pv": adders_pv_delay,
        "accumulators": accumulators_delay,
    }
    self.delay = sum(self.delay_breakdown.values())
def get_peak_energy_single_cycle(self):
    """
    Macro-level energy of a single compute cycle of the imc array at peak.

    Peak means full utilization of the array and no weight updating. The
    breakdown covers cells (precharging), multipliers, adder trees, place-value
    adders (adders_pv) and accumulators; input/output registers are not included.

    :return: dict with the keys "precharging", "mults", "adders", "adders_pv"
             and "accumulators" (unit: pJ, the unit borrowed from CACTI)
    :raises AssertionError: if self.bl_dim_size is not a power of 2
    """
    hd = self.hd_param
    weight_bits = hd["weight_precision"]
    bits_per_cycle = hd["input_bit_per_cycle"]

    # Multiplier array: one 1b multiplier per (input bit, weight bit, cell) in every bank.
    mult_count = bits_per_cycle * weight_bits * self.wl_dim_size * self.bl_dim_size * self.nb_of_banks
    mults_energy = self.logic_unit.get_1b_multiplier_energy() * mult_count

    # Adder trees (type: RCA): each tree reduces bl_dim_size products of weight_bits bits.
    tree_inputs = self.bl_dim_size
    tree_depth = math.log2(tree_inputs)
    assert tree_depth % 1 == 0, \
        f"[DimcArray] The number of inputs [{tree_inputs}] for the adder tree is not in the power of 2."
    tree_depth = int(tree_depth)  # float -> int for simplicity
    tree_output_bits = weight_bits + tree_depth  # output precision of the adder tree
    adders_per_tree = tree_inputs * (weight_bits + 1) - (weight_bits + tree_depth + 1)
    tree_count = bits_per_cycle * self.wl_dim_size * self.nb_of_banks
    adders_energy = self.logic_unit.get_1b_adder_energy() * adders_per_tree * tree_count

    # Place-value adders (type: RCA): only present when several input bits arrive per cycle.
    adders_pv_energy = 0
    if bits_per_cycle != 1:
        pv_adder_count = tree_output_bits * (bits_per_cycle - 1) + \
            bits_per_cycle * (math.log2(bits_per_cycle) - 0.5)
        pv_tree_count = self.wl_dim_size * self.nb_of_banks
        adders_pv_energy = self.logic_unit.get_1b_adder_energy() * pv_adder_count * pv_tree_count

    # Accumulators (adder type: RCA): not needed when the whole input arrives in one cycle.
    accumulators_energy = 0
    if bits_per_cycle != hd["input_precision"]:
        acc_output_bits = hd["input_precision"] + hd["weight_precision"] + math.log2(self.bl_dim_size)
        # every accumulator bit costs one 1b adder plus one 1b register
        acc_bit_count = acc_output_bits * self.wl_dim_size * self.nb_of_banks
        accumulators_energy = self.logic_unit.get_1b_adder_energy() * acc_bit_count + \
            self.logic_unit.get_1b_reg_energy() * acc_bit_count

    return {  # unit: pJ (the unit borrowed from CACTI)
        "precharging": 0,  # precharging is counted as zero in the peak figure
        "mults": mults_energy,
        "adders": adders_energy,
        "adders_pv": adders_pv_energy,
        "accumulators": accumulators_energy,
    }
@staticmethod
def calculate_mapped_rows_total_when_diagonal_mapping_found(layer, layer_const_operand, layer_act_operand, sm_on_wl_dim, sm_on_bl_dim):
    """
    Calculate the mapped number of rows when a weight-irrelevant loop (e.g. OX / OY)
    is unrolled on the wordline dimension, which requires a diagonal data mapping.

    The function also works when no such unrolling exists; it is then only slower
    than the direct product of the bitline unrollings.

    :param layer: object exposing operand_loop_dim[operand]["ir"] (list of layer dims)
                  and operand_loop_dim[operand]["pr"] (dict whose values are pairs of layer dims)
    :param layer_const_operand: name of the weight operand
    :param layer_act_operand: name of the activation operand
    :param sm_on_wl_dim: spatial mapping on the wordline dimension, either a single loop
                         such as ("K", 2) or a mix such as (("K", 2), ("OX", 2))
    :param sm_on_bl_dim: spatial mapping on the bitline dimension, same two shapes
    :return: (mapped_rows_total, mapped_rows_for_adder), both rounded up to integers.
             mapped_rows_total is used for the energy of wordlines and multipliers,
             mapped_rows_for_adder is the number of activated inputs of an adder tree.
    """
    def _as_loops(spatial_mapping):
        # Normalize ("K", 2) to (("K", 2),) so both shapes share one code path.
        return (spatial_mapping,) if isinstance(spatial_mapping[0], str) else spatial_mapping

    # Layer dims that are irrelevant for the weight operand (e.g. OX / OY).
    weight_ir_layer_dims = layer.operand_loop_dim[layer_const_operand]["ir"]
    # Pairs of layer dims that together form a partially relevant activation dim.
    # For neural networks OX pairs with FX and OY with FY, so a pair size of 2 is assumed.
    act_pr_layer_dims = layer.operand_loop_dim[layer_act_operand]["pr"]

    # pr_sm[partner_dim] = {weight_ir_dim: unrolling on wl_dim}; 1 means no mapping found.
    # pr_sm_link[weight_ir_dim] = partner_dim.
    pr_sm = {}
    pr_sm_link = {}
    for layer_dim1, layer_dim2 in act_pr_layer_dims.values():
        if layer_dim1 in weight_ir_layer_dims:
            ir_dim, partner_dim = layer_dim1, layer_dim2
        else:  # layer_dim2 is taken to be the weight-irrelevant one
            ir_dim, partner_dim = layer_dim2, layer_dim1
        pr_sm[partner_dim] = {ir_dim: 1}
        pr_sm_link[ir_dim] = partner_dim

    # Record how far each paired weight-irrelevant dim is unrolled on wl_dim.
    for loop in _as_loops(sm_on_wl_dim):
        loop_dim, loop_size = loop[0], loop[1]
        if loop_dim not in weight_ir_layer_dims:
            continue
        partner_dim = pr_sm_link.get(loop_dim)
        if partner_dim is None:
            # A weight-irrelevant dim without a pr partner (e.g. a batch dim) has no
            # counterpart on the bitline side, so it cannot add diagonal rows.
            # Looking it up unconditionally used to raise KeyError.
            continue
        # multiply in case the same dim shows up several times in a mixed mapping
        pr_sm[partner_dim][loop_dim] *= loop_size

    # mapped_rows_total   = Cu * (OYu + FYu - 1) * (OXu + FXu - 1)
    # mapped_rows_for_adder = Cu * FYu * FXu
    mapped_rows_total = 1
    mapped_rows_for_adder = 1
    for loop in _as_loops(sm_on_bl_dim):
        layer_dim, layer_dim_size = loop[0], loop[1]
        if layer_dim in pr_sm:  # e.g. ("FX", 2)
            additional_diag_rows = next(iter(pr_sm[layer_dim].values())) - 1
        else:  # e.g. ("C", 2)
            additional_diag_rows = 0
        mapped_rows_total *= layer_dim_size + additional_diag_rows
        mapped_rows_for_adder *= layer_dim_size

    # Ceil to an upper integer, as required by the adder-tree model.
    return math.ceil(mapped_rows_total), math.ceil(mapped_rows_for_adder)
+ """ + + # activation/weight representation + layer_act_operand, layer_const_operand = DimcArray.identify_layer_operand_representation(layer) + + spatial_mapping = copy.deepcopy(layer.user_spatial_mapping) + + # Figure out the spatial mapping in a single macro + spatial_mapping_in_macro = [] + for layer_dim, loop in spatial_mapping.items(): + if layer_dim in [wl_dim, bl_dim]: # serve the dimension inside the macro + if isinstance(loop[0], str): # single layer_dim unrolling + spatial_mapping_in_macro.append(loop) + else: # mix layer_dim unrolling + for element in loop: + spatial_mapping_in_macro.append(element) + + # We will firstly derive how many number of PE columns and rows are mapping. + # Later, energy of unmapped rows and columns will be set to 0. + # We start from deriving the number of mapped columns in each macro. + # the sm loop would do not exist if did not find any + if wl_dim not in spatial_mapping.keys(): + mapped_cols = 1 # mapped number of wl dims + weight_ir_loop_on_wl_dim = False # if there is OX / OY mapped on wl dims + else: + sm_on_wl_dim = spatial_mapping[wl_dim] # spatial mapping on wl_dimension + if isinstance(sm_on_wl_dim[0], str): # single layer mapping (e.g. ("K", 2)) + mapped_cols = sm_on_wl_dim[1] # floating number is also supported for calculation + else: # mix layer_dim mapping (e.g. (("K",2), ("OX",2)) ) + mapped_cols = math.prod([v[1] for v in sm_on_wl_dim]) + # We then calculate the number of mapped rows in each macro. + # As there might be OX / OY unrolling, which results in a diagonal mapping, we will have a special check on that + # Firstly check if there is OX / OY unrolling + weight_ir_layer_dims: list = layer.operand_loop_dim[layer_const_operand]["ir"] + weight_ir_loop_on_wl_dim = False # set default value + if isinstance(sm_on_wl_dim[0], str): # single layer mapping (e.g. ("K", 2)) + weight_ir_loop_on_wl_dim = True if sm_on_wl_dim[0] in weight_ir_layer_dims else False + else: # mix layer_dim mapping (e.g. 
@staticmethod
def get_precharge_energy(hd_param, tech_param, layer, mapping):
    """
    Calculate the pre-charging energy on local bitlines for a specific layer and
    mapping, together with the mapped group depth (number of weights stored in a
    cell group).

    A pre-charge is only modelled when hd_param["group_depth"] > 1; otherwise
    (0, 1) is returned without looking at layer or mapping.

    :param hd_param: dict providing "group_depth" and "weight_precision"
    :param tech_param: dict providing "wl_cap", "bl_cap" and "vdd"
    :param layer: layer providing operand_loop_dim[operand]["r"] / ["ir"]
    :param mapping: mapping providing temporal_mapping.mapping_dic_origin and
                    unit_mem_data_movement for the weight operand
    :return: (energy_precharging, mapped_group_depth)
    """
    group_depth = hd_param["group_depth"]
    if not group_depth > 1:
        # no pre-charge operation is modelled for a cell group of depth 1
        return 0, 1

    # activation/weight representation; only the weight operand is needed here
    _, weight_operand = DimcArray.identify_layer_operand_representation(layer)

    # temporal loops at the lowest memory level of the weight operand
    cell_group_loops = mapping.temporal_mapping.mapping_dic_origin[weight_operand][0]
    weight_r_dims = layer.operand_loop_dim[weight_operand]["r"]
    weight_ir_dims = layer.operand_loop_dim[weight_operand]["ir"]

    # Interval between two precharge operations (1: precharge every cycle).
    # Loops at the beginning execute first, so the leading weight-irrelevant loops
    # are multiplied together; counting stops at the first loop that is not
    # weight-irrelevant.
    precharge_interval = 1
    for loop_name, loop_size in cell_group_loops:
        if loop_name not in weight_ir_dims:
            break
        precharge_interval *= loop_size

    # nb_of_precharge_times = rd_out_to_low count of the lowest weight memory / precharge interval
    # (normalized to a single PE)
    lowest_weight_mem = mapping.unit_mem_data_movement[weight_operand][0]
    nb_of_precharge_times = lowest_weight_mem.data_elem_move_count.rd_out_to_low / precharge_interval

    # final pre-charge energy = energy/PE * nb_of_precharge_times
    vdd_squared = tech_param["vdd"] ** 2
    single_pe_precharge_energy = (tech_param["wl_cap"] * vdd_squared +
                                  tech_param["bl_cap"] * vdd_squared * group_depth) * \
        hd_param["weight_precision"]
    energy_precharging = single_pe_precharge_energy * nb_of_precharge_times

    # mapped group depth: product of all weight-relevant loop sizes in the cell group
    mapped_group_depth = math.prod(
        loop_size for loop_name, loop_size in cell_group_loops if loop_name in weight_r_dims
    )
    return energy_precharging, mapped_group_depth
def get_adder_trees_energy(self, layer, logic_unit, mapped_rows_for_adder, bl_dim_size, mapped_cols, layer_act_operand_pres, macro_activation_times):
    """
    Get the energy spent on RCA adder trees for a specific layer and mapping.

    The 1b full adders (FA) of a tree are split in two classes:
      1. fully activated FA: two of its inputs have data coming in (higher energy cost);
      2. half activated FA: only one of its inputs has data coming in. No carry is
         generated and the carry path stays unchanged, so the energy cost is lower.

    :param layer: layer whose weight precision is the input precision of an adder tree
    :param logic_unit: object providing get_1b_adder_energy() and
                       get_1b_adder_energy_half_activated()
    :param mapped_rows_for_adder: number of used inputs of a single adder tree
    :param bl_dim_size: physical number of inputs of a single adder tree (power of 2)
    :param mapped_cols: number of mapped columns
    :param layer_act_operand_pres: activation precision of the layer
    :param macro_activation_times: number of times the macro is activated
    :return: (energy_adders, adder_output_pres)
    :raises AssertionError: if bl_dim_size is not a power of 2
    """
    # activation/weight representation; only the weight operand is needed here
    _, layer_const_operand = self.identify_layer_operand_representation(layer)

    layer_const_operand_pres = layer.operand_precision[layer_const_operand]
    nb_inputs_of_adder = bl_dim_size  # physical number of inputs in a single adder tree
    adder_depth = math.log2(nb_inputs_of_adder)
    # The check must be on the depth: testing nb_inputs_of_adder % 1 passes for every
    # integer size and lets non-power-of-2 trees through to the model below.
    assert adder_depth % 1 == 0, \
        f"The number of inputs for an adder tree [{nb_inputs_of_adder}] is not in the power of 2."
    adder_depth = int(adder_depth)  # float -> int for simplicity
    mapped_inputs = mapped_rows_for_adder  # number of used inputs for an adder tree
    adder_input_pres = layer_const_operand_pres  # input precision for a single adder tree
    adder_output_pres = adder_input_pres + adder_depth
    # nb of 1b adders in a single adder tree
    nb_of_1b_adder = nb_inputs_of_adder * (adder_input_pres + 1) - (adder_input_pres + adder_depth + 1)

    if not mapped_inputs >= 1:
        # nothing is mapped on the adder tree
        return 0, adder_output_pres

    if mapped_inputs >= nb_inputs_of_adder:
        # all inputs are used: every 1b adder may produce a carry
        fully_activated_number_of_1b_adder = nb_of_1b_adder
        half_activated_number_of_1b_adder = 0
    else:
        # Inputs are not fully mapped. Iteratively compare the inputs still to be
        # processed (left_input) with a reference (baseline) that halves each round,
        # until no input is left.
        fully_activated_number_of_1b_adder = 0
        half_activated_number_of_1b_adder = 0
        left_input = mapped_inputs
        baseline = nb_inputs_of_adder
        while left_input != 0:
            baseline = baseline / 2
            activated_depth = int(math.log2(baseline))
            if left_input <= 1 and baseline == 1:  # special case: last single input
                half_activated_number_of_1b_adder += adder_input_pres
                left_input = 0
            elif left_input > baseline:
                # a complete sub-tree of size baseline plus the adder joining it to the rest
                fully_activated_number_of_1b_adder += \
                    baseline * (adder_input_pres + 1) - (adder_input_pres + activated_depth + 1) + \
                    (adder_input_pres + activated_depth)
                left_input = left_input - baseline
            elif left_input < baseline:
                # this level only passes data through on one port
                half_activated_number_of_1b_adder += adder_input_pres + activated_depth
            else:  # left_input == baseline
                fully_activated_number_of_1b_adder += \
                    baseline * (adder_input_pres + 1) - (adder_input_pres + activated_depth + 1)
                half_activated_number_of_1b_adder += adder_input_pres + activated_depth
                left_input = left_input - baseline

    single_adder_tree_energy = fully_activated_number_of_1b_adder * logic_unit.get_1b_adder_energy() + \
        half_activated_number_of_1b_adder * logic_unit.get_1b_adder_energy_half_activated()
    nb_of_activation_times = mapped_cols * layer_act_operand_pres * macro_activation_times
    energy_adders = single_adder_tree_energy * nb_of_activation_times
    return energy_adders, adder_output_pres
+ # this check can be removed if variable precision is supported in the future. + assert layer_const_operand_pres == weight_pres_in_hd_param, \ + f"Weight precision defined in the workload [{layer_const_operand_pres}] not equal to the one defined in the hardware hd_param [{weight_pres_in_hd_param}]." + assert layer_act_operand_pres == act_pres_in_hd_param, \ + f"Activation precision defined in the workload [{layer_act_operand_pres}] not equal to the one defined in the hardware hd_param [{act_pres_in_hd_param}]." + + """parameter extraction""" + mapped_rows_total, mapped_rows_for_adder, mapped_cols, macro_activation_times = DimcArray.get_mapped_oa_dim(layer, self.wl_dim, self.bl_dim) + self.mapped_rows_total = mapped_rows_total + + """energy calculation""" + """energy of precharging""" + energy_precharging, mapped_group_depth = DimcArray.get_precharge_energy(self.hd_param, self.logic_unit.tech_param, layer, mapping) + self.mapped_group_depth = mapped_group_depth + + """energy of multiplier array""" + energy_mults = self.get_mults_energy(self.hd_param, self.logic_unit, layer, mapped_rows_total, self.wl_dim_size, macro_activation_times) + + """energy of adder trees (type: RCA)""" + energy_adders, adder_output_pres = self.get_adder_trees_energy(layer, self.logic_unit, mapped_rows_for_adder, + self.bl_dim_size, mapped_cols, layer_act_operand_pres, macro_activation_times) + + """energy of adders_pv (type: RCA)""" + nb_inputs_of_adder_pv = self.hd_param["input_bit_per_cycle"] + input_bit_per_cycle = self.hd_param["input_bit_per_cycle"] + energy_adders_pv = self.get_adder_pv_energy(nb_inputs_of_adder_pv, adder_output_pres, self.logic_unit, layer_act_operand_pres, + input_bit_per_cycle, mapped_cols, macro_activation_times) + + """energy of accumulators (adder type: RCA)""" + if input_bit_per_cycle == layer_act_operand_pres: + energy_accumulators = 0 + else: + accumulator_output_pres = 
@staticmethod
def identify_layer_operand_representation(layer):
    """
    Work out which operand of the layer is the activation and which is the weight.

    An operand with a non-empty "pr" entry in layer.operand_loop_dim is taken as
    the activation (true for conv layers); the weight is then the first input
    operand that differs from it. When no operand has "pr" loops (true for
    fully-connected layers), the weight is the first operand with an empty "ir"
    entry and the activation is the first input operand that differs from it.

    :param layer: object providing operand_loop_dim (dict per operand with "pr"
                  and "ir" entries) and input_operands
    :return: (act_operand, const_operand)
    :raises IndexError: if no matching operand is found
    """
    loop_dims = layer.operand_loop_dim

    # activation representation: operands with partially relevant loops (conv layers)
    operands_with_pr = []
    for operand in loop_dims.keys():
        if len(loop_dims[operand]["pr"]) > 0:
            operands_with_pr.append(operand)

    if len(operands_with_pr) != 0:
        act_operand = operands_with_pr[0]
        # weight representation (conv layers)
        remaining_inputs = [operand for operand in layer.input_operands if operand != act_operand]
        return act_operand, remaining_inputs[0]

    # fully-connected (fc) layers: no operand has pr loops
    operands_without_ir = []
    for operand in loop_dims.keys():
        if len(loop_dims[operand]["ir"]) == 0:
            operands_without_ir.append(operand)
    const_operand = operands_without_ir[0]
    # activation representation (fc layers)
    remaining_inputs = [operand for operand in layer.input_operands if operand != const_operand]
    return remaining_inputs[0], const_operand
multipliers -\ + + +# | + . \ + + +# + . - adders (DIMC) + + +# D2 + . / OR adcs (AIMC) + + +# + multipliers -/ | + + +# | + ^ | + + +# | + | | + + +# | + ^ +++++++ v + + +# | + | + W + adders_pv (place value) + + +# | + group_depth +++++++ | + + +# | + | + W + v + + +# | + v +++++++ accumulators + + +# | + | + + +# v + | + + +# - +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +# + | + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# (nb_of_rows/macro = D2 * group_depth) | +# v +# outputs +# + + tech_param_28nm = { + "tech_node":0.028, # unit: um + "vdd": 0.9, # unit: V + "nd2_cap": 0.7/1e3, # unit: pF + "xor2_cap": 0.7*1.5/1e3, # unit: pF + "dff_cap": 0.7*3/1e3, # unit: pF + "nd2_area": 0.614/1e6, # unit: mm^2 + "xor2_area":0.614*2.4/1e6, # unit: mm^2 + "dff_area": 0.614*6/1e6, # unit: mm^2 + "nd2_dly": 0.0478, # unit: ns + "xor2_dly": 0.0478*2.4, # unit: ns + # "dff_dly": 0.0478*3.4, # unit: ns + } + dimensions = { + "D1": 32/8, # wordline dimension + "D2": 32, # bitline dimension + "D3": 1, # nb_macros + } # {"D1": ("K", 4), "D2": ("C", 32),} + + """hd_param example for DIMC""" + hd_param = { + "pe_type": "in_sram_computing", # required for CostModelStage + "imc_type": "digital", # "digital" or "analog". 
class ImcArray(OperationalArray):
    """
    General IMC array, covering both analog (AIMC) and digital (DIMC) in-memory computing.

    The constructor builds the matching operational unit and copies its area, clock
    period and macro-level peak performance onto this object.
    """

    def __init__(self, tech_param: Dict[str, float], hd_param: dict, dimensions: Dict[str, int]):
        """
        :param tech_param: definition of technology-related parameters
        :param hd_param: hardware architecture parameters except dimensions;
                         must provide "imc_type" ("digital" or "analog") and "pe_type"
        :param dimensions: dimensions definition
        :raises ValueError: if hd_param["imc_type"] is neither "digital" nor "analog"
        """
        imc_type = hd_param["imc_type"]
        if imc_type == "digital":
            operational_unit = DimcArray(tech_param, hd_param, dimensions)
        elif imc_type == "analog":
            operational_unit = AimcArray(tech_param, hd_param, dimensions)
        else:
            # Without this branch the parent constructor is never called and the first
            # use of self.unit below fails with an unrelated AttributeError.
            raise ValueError(
                f"Unsupported imc_type [{imc_type}] for ImcArray: expected \"digital\" or \"analog\"."
            )
        super().__init__(operational_unit=operational_unit, dimensions=dimensions)

        # NOTE(review): self.unit is presumably set by OperationalArray.__init__ - confirm.
        self.unit.get_area()  # update self.area and self.area_breakdown
        self.unit.get_delay()  # update self.delay and self.delay_breakdown
        self.area_breakdown = self.unit.area_breakdown
        self.total_area = self.unit.area
        self.tclk_breakdown = self.unit.delay_breakdown  # clock period breakdown
        self.tclk = self.unit.delay  # maximum clock period (unit: ns)
        self.pe_type = hd_param["pe_type"]
        self.imc_type = imc_type
        self.tops_peak, self.topsw_peak, self.topsmm2_peak = self.unit.get_macro_level_peak_performance()

    def __jsonrepr__(self):
        """JSON representation of this class, used to save it to a json file."""
        return {"operational_unit": self.unit, "dimensions": self.dimensions}
like branch target buffers, directory, etc. \n', + '# change the tag size parameter\n', + '# if you want cacti to calculate the tagbits, set the tag size to "default"\n', + '-tag size (b) "default"\n', + '//-tag size (b) 22\n', + '\n', + '# fast - data and tag access happen in parallel\n', + '# sequential - data array is accessed after accessing the tag array\n', + '# normal - data array lookup and tag access happen in parallel\n', + '# final data block is broadcasted in data array h-tree \n', + '# after getting the signal from the tag array\n', + '//-access mode (normal, sequential, fast) - "fast"\n', + '-access mode (normal, sequential, fast) - "normal"\n', + '//-access mode (normal, sequential, fast) - "sequential"\n', + '\n', + '\n', + '# DESIGN OBJECTIVE for UCA (or banks in NUCA)\n', + '-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0\n', + '\n', + '# Percentage deviation from the minimum value \n', + '# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization\n', + '# that compromises at most 10% delay. \n', + '# NOTE: Try reasonable values for % deviation. Inconsistent deviation\n', + '# percentage values will not produce any valid organizations. For example,\n', + '# 0:0:100:100:100 will try to identify an organization that has both\n', + '# least delay and dynamic power. Since such an organization is not possible, CACTI will\n', + '# throw an error. Refer CACTI-6 Technical report for more details\n', + '-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000\n', + '\n', + '# Objective for NUCA\n', + '-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100\n', + '-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000\n', + '\n', + '# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for\n', + '# energy-delay or energy-delay sq. 
product\n', + '# Note: Optimize tag will disable weight or deviate values mentioned above\n', + '# Set it to NONE to let weight and deviate values determine the \n', + '# appropriate cache configuration\n', + '//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"\n', + '-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"\n', + '//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"\n', + '\n', + '-Cache model (NUCA, UCA) - "UCA"\n', + '//-Cache model (NUCA, UCA) - "NUCA"\n', + '\n', + '# In order for CACTI to find the optimal NUCA bank value the following\n', + '# variable should be assigned 0.\n', + '-NUCA bank count 0\n', + '\n', + '# NOTE: for nuca network frequency is set to a default value of \n', + '# 5GHz in time.c. CACTI automatically\n', + '# calculates the maximum possible frequency and downgrades this value if necessary\n', + '\n', + '# By default CACTI considers both full-swing and low-swing \n', + '# wires to find an optimal configuration. However, it is possible to \n', + '# restrict the search space by changing the signaling from "default" to \n', + '# "fullswing" or "lowswing" type.\n', + '-Wire signaling (fullswing, lowswing, default) - "Global_30"\n', + '//-Wire signaling (fullswing, lowswing, default) - "default"\n', + '//-Wire signaling (fullswing, lowswing, default) - "lowswing"\n', + '\n', + '//-Wire inside mat - "global"\n', + '-Wire inside mat - "semi-global"\n', + '//-Wire outside mat - "global"\n', + '-Wire outside mat - "semi-global"\n', + '\n', + '-Interconnect projection - "conservative"\n', + '//-Interconnect projection - "aggressive"\n', + '\n', + '# Contention in network (which is a function of core count and cache level) is one of\n', + '# the critical factor used for deciding the optimal bank count value\n', + '# core count can be 4, 8, or 16\n', + '//-Core count 4\n', + '-Core count 8\n', + '//-Core count 16\n', + '-Cache level (L2/L3) - "L3"\n', + '\n', + '-Add ECC - "true"\n', + '\n', + '//-Print level (DETAILED, CONCISE) - "CONCISE"\n', + '-Print 
level (DETAILED, CONCISE) - "DETAILED"\n', + '\n', + '# for debugging\n', + '-Print input parameters - "true"\n', + '//-Print input parameters - "false"\n', + '# force CACTI to model the cache with the \n', + '# following Ndbl, Ndwl, Nspd, Ndsam,\n', + '# and Ndcm values\n', + '//-Force cache config - "true"\n', + '-Force cache config - "false"\n', + '-Ndwl 1\n', + '-Ndbl 1\n', + '-Nspd 0\n', + '-Ndcm 1\n', + '-Ndsam1 0\n', + '-Ndsam2 0\n', + '\n', + '\n', + '\n', + '#### Default CONFIGURATION values for baseline external IO parameters to DRAM. More details can be found in the CACTI-IO technical report (), especially Chapters 2 and 3.\n', + '\n', + '# Memory Type (D3=DDR3, D4=DDR4, L=LPDDR2, W=WideIO, S=Serial). Additional memory types can be defined by the user in extio_technology.cc, along with their technology and configuration parameters.\n', + '\n', + '-dram_type "DDR3"\n', + '//-dram_type "DDR4"\n', + '//-dram_type "LPDDR2"\n', + '//-dram_type "WideIO"\n', + '//-dram_type "Serial"\n', + '\n', + '# Memory State (R=Read, W=Write, I=Idle or S=Sleep) \n', + '\n', + '//-io state "READ"\n', + '-io state "WRITE"\n', + '//-io state "IDLE"\n', + '//-io state "SLEEP"\n', + '\n', + '#Address bus timing. To alleviate the timing on the command and address bus due to high loading (shared across all memories on the channel), the interface allows for multi-cycle timing options. \n', + '\n', + '//-addr_timing 0.5 //DDR\n', + '-addr_timing 1.0 //SDR (half of DQ rate)\n', + '//-addr_timing 2.0 //2T timing (One fourth of DQ rate)\n', + '//-addr_timing 3.0 // 3T timing (One sixth of DQ rate)\n', + '\n', + '# Memory Density (Gbit per memory/DRAM die)\n', + '\n', + '-mem_density 4 Gb //Valid values 2^n Gb\n', + '\n', + '# IO frequency (MHz) (frequency of the external memory interface).\n', + '\n', + '-bus_freq 800 MHz //As of current memory standards (2013), valid range 0 to 1.5 GHz for DDR3, 0 to 533 MHz for LPDDR2, 0 - 800 MHz for WideIO and 0 - 3 GHz for Low-swing differential. 
However this can change, and the user is free to define valid ranges based on new memory types or extending beyond existing standards for existing dram types.\n', + '\n', + '# Duty Cycle (fraction of time in the Memory State defined above)\n', + '\n', + '-duty_cycle 1.0 //Valid range 0 to 1.0\n', + '\n', + '# Activity factor for Data (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)\n', + ' \n', + '-activity_dq 1.0 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR\n', + '\n', + '# Activity factor for Control/Address (0->1 transitions) per cycle (for DDR, need to account for the higher activity in this parameter. E.g. max. activity factor for DDR is 1.0, for SDR is 0.5)\n', + '\n', + '-activity_ca 0.5 //Valid range 0 to 1.0 for DDR, 0 to 0.5 for SDR, 0 to 0.25 for 2T, and 0 to 0.17 for 3T\n', + '\n', + '# Number of DQ pins \n', + '\n', + '-num_dq 72 //Number of DQ pins. Includes ECC pins.\n', + '\n', + '# Number of DQS pins. DQS is a data strobe that is sent along with a small number of data-lanes so the source synchronous timing is local to these DQ bits. Typically, 1 DQS per byte (8 DQ bits) is used. The DQS is also typucally differential, just like the CLK pin. \n', + '\n', + '-num_dqs 18 //2 x differential pairs. Include ECC pins as well. Valid range 0 to 18. For x4 memories, could have 36 DQS pins.\n', + '\n', + '# Number of CA pins \n', + '\n', + '-num_ca 25 //Valid range 0 to 35 pins.\n', + '\n', + '# Number of CLK pins. CLK is typically a differential pair. In some cases additional CLK pairs may be used to limit the loading on the CLK pin. \n', + '\n', + '-num_clk 2 //2 x differential pair. Valid values: 0/2/4.\n', + '\n', + '# Number of Physical Ranks\n', + '\n', + '-num_mem_dq 2 //Number of ranks (loads on DQ and DQS) per buffer/register. If multiple LRDIMMs or buffer chips exist, the analysis for capacity and power is reported per buffer/register. 
\n', + '\n', + '# Width of the Memory Data Bus\n', + '\n', + '-mem_data_width 8 //x4 or x8 or x16 or x32 memories. For WideIO upto x128.\n', + '\n', + '# RTT Termination Resistance\n', + '\n', + '-rtt_value 10000\n', + '\n', + '# RON Termination Resistance\n', + '\n', + '-ron_value 34\n', + '\n', + '# Time of flight for DQ\n', + '\n', + '-tflight_value\n', + '\n', + '# Parameter related to MemCAD\n', + '\n', + '# Number of BoBs: 1,2,3,4,5,6,\n', + '-num_bobs 1\n', + '\t\n', + '# Memory System Capacity in GB\n', + '-capacity 80\t\n', + '\t\n', + '# Number of Channel per BoB: 1,2. \n', + '-num_channels_per_bob 1\t\n', + '\n', + '# First Metric for ordering different design points\t\n', + '-first metric "Cost"\n', + '#-first metric "Bandwidth"\n', + '#-first metric "Energy"\n', + '\t\n', + '# Second Metric for ordering different design points\t\n', + '#-second metric "Cost"\n', + '-second metric "Bandwidth"\n', + '#-second metric "Energy"\n', + '\n', + '# Third Metric for ordering different design points\t\n', + '#-third metric "Cost"\n', + '#-third metric "Bandwidth"\n', + '-third metric "Energy"\t\n', + '\t\n', + '\t\n', + '# Possible DIMM option to consider\n', + '#-DIMM model "JUST_UDIMM"\n', + '#-DIMM model "JUST_RDIMM"\n', + '#-DIMM model "JUST_LRDIMM"\n', + '-DIMM model "ALL"\n', + '\n', + '#if channels of each bob have the same configurations\n', + '#-mirror_in_bob "T"\n', + '-mirror_in_bob "F"\n', + '\n', + '#if we want to see all channels/bobs/memory configurations explored\t\n', + '#-verbose "T"\n', + '#-verbose "F"\n', + '\n', + '=======USER DEFINE======= \n'] + + self.config_options = {} + ''' entire memory size (unit: Byte, range: >= 64)''' + self.config_options['cache_size'] = {'string': '-size (bytes) ', + 'option': [64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, + 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, + 8388608, 16777216, 33554432, 134217728, 67108864, + 1073741824], + 'default': 64} + + ''' number of bytes on a single 
def change_default_value(self, name_list, new_value_list):
    """Overwrite the stored default of each named option.

    The i-th entry of ``new_value_list`` becomes the new ``'default'`` of the
    option named by the i-th entry of ``name_list``.
    """
    options = self.config_options
    for position, option_name in enumerate(name_list):
        # Read the replacement first so a too-short value list fails before
        # any lookup of the option itself.
        replacement = new_value_list[position]
        options[option_name]['default'] = replacement
def write_config(self, user_config, path):
    """Write a CACTI input file: the baseline template followed by the user options.

    :param user_config: iterable of option strings (each already newline-terminated)
    :param path: destination of the generated config file; created or truncated
    """
    # The context manager closes (and flushes) the handle even when one of the
    # writes raises, so no descriptor leaks and no partially-buffered file is
    # left open when CACTI is launched right afterwards.
    with open(path, "w+") as cfg_file:
        cfg_file.write(''.join(self.baseline_config))
        cfg_file.write(''.join(user_config))
def get_cacti_cost(cacti_path, tech_node, mem_type, mem_size_in_byte, bw, hd_hash="a"):
    '''
    Run CACTI 7.0 for one memory and extract access time, area, read energy and write energy.

    :param cacti_path: the location of cacti (used as working directory while CACTI runs)
    :param tech_node: technology node (directly supported node by CACTI: 0.022, 0.032, 0.045, 0.065, 0.09, 0.18);
                      0.028 is evaluated as 0.032 with every result scaled by 0.9*0.9
    :param mem_type: memory type ('sram' or 'dram')
    :param mem_size_in_byte: memory size (unit: byte)
    :param bw: memory IO bitwidth (unit: bit)
    :param hd_hash: input file suffix when generating the CACTI input file (avoids file conflicts in
                    multi-processing simulation)
    :return: (access_time [ns], area [mm2], r_cost [pJ/access], w_cost [pJ/access])
    :raises ValueError: if mem_type is neither 'dram' nor 'sram'
    :raises RuntimeError: if CACTI left no readable output file, or the output lacks an expected column
    Attention: for CACTI, the minimum mem_size=64B, minimum_rows=32
    '''
    import logging as _logging
    _logging_level = _logging.CRITICAL
    _logging_format = '%(asctime)s - %(funcName)s +%(lineno)s - %(levelname)s - %(message)s'
    _logging.basicConfig(level=_logging_level, format=_logging_format)

    # CACTI is invoked through relative paths, so run from inside its folder ...
    cwd = os.getcwd()
    os.chdir(cacti_path)
    try:
        # input parameters definition
        if tech_node == 0.028:
            tech = 0.032  # technology: 32 nm (corresponding VDD = 0.9)
            scaling_factor = 0.9*0.9
        else:
            tech = tech_node
            scaling_factor = 1
        if mem_type == 'dram':
            mem = '"main memory"'
        elif mem_type == 'sram':
            mem = '"ram"'
        else:
            msg = f'mem_type can only be dram or sram. Now it is: {mem_type}'
            raise ValueError(msg)

        # The area estimate from CACTI grows faster than 1x when bw > 32, so such memories
        # are modeled as a 32-bit wide array with the same number of rows and the results
        # are scaled linearly by bw/32 afterwards.
        rows = mem_size_in_byte * 8/bw
        if bw > 32:  # adjust the setting for CACTI
            line_size = int(32/8)
            IO_bus_width = 32
            mem_size_in_byte_adjust = rows * 32 / 8
        else:  # normal case
            line_size = int(bw/8)  # how many bytes on a row
            IO_bus_width = bw
            mem_size_in_byte_adjust = mem_size_in_byte

        file_path = './self_gen'  # location for input file (cache.cfg) and output file (cache.cfg.out)
        os.makedirs(file_path, exist_ok=True)

        C = CactiConfig()
        C.cacti_auto(['single', [['technology', 'cache_size', 'line_size', 'IO_bus_width', 'mem_type'],
                                 [tech, mem_size_in_byte_adjust, line_size, IO_bus_width, mem]]],
                     f"{file_path}/cache_{hd_hash}.cfg")

        # read out result
        try:
            with open(f'{file_path}/cache_{hd_hash}.cfg.out', 'r') as f:
                raw_result = f.readlines()
        except OSError as err:
            msg = f'CACTI failed. [current setting] rows: {rows}, bw: {bw}, mem size (byte): {mem_size_in_byte}'
            _logging.critical(msg)
            hint = f'[CACTI minimal requirement] rows: >= 32, bw: >= 8, mem size (byte): >=64'
            _logging.critical(hint)
            # Raise instead of exit(): exit() would terminate the whole interpreter with a
            # success status and skip restoring the working directory.
            raise RuntimeError(f'{msg} {hint}') from err

        # First line holds the column names, every following line one result row.
        result = {}
        attribute_list = []
        for ii, each_line in enumerate(raw_result):
            if ii == 0:
                attribute_list = each_line.split(',')
                for each_attribute in attribute_list:
                    result[each_attribute] = []
            else:
                for jj, each_value in enumerate(each_line.split(',')):
                    try:
                        result[attribute_list[jj]].append(float(each_value))
                    except (ValueError, IndexError):
                        # non-numeric cell, or more cells than header columns: ignore it
                        pass

        # get required cost (last row of each column)
        try:
            access_time = scaling_factor*float(result[' Access time (ns)'][-1])  # unit: ns
            if bw > 32:
                area = scaling_factor*float(result[' Area (mm2)'][-1]) * 2 * bw/32  # unit: mm2
                r_cost = scaling_factor*float(result[' Dynamic read energy (nJ)'][-1]) * bw/32  # unit: nJ
                w_cost = scaling_factor*float(result[' Dynamic write energy (nJ)'][-1]) * bw/32  # unit: nJ
            else:
                area = scaling_factor*float(result[' Area (mm2)'][-1]) * 2  # unit: mm2
                r_cost = scaling_factor*float(result[' Dynamic read energy (nJ)'][-1])  # unit: nJ
                w_cost = scaling_factor*float(result[' Dynamic write energy (nJ)'][-1])  # unit: nJ
        except KeyError as err:
            _logging.critical(f'**KeyError** in result, current result: {result}')
            # Previously this dropped into breakpoint() and then failed on an unbound `area`.
            raise RuntimeError(f'CACTI output has no column {err}') from err
    finally:
        # ... and always change back, also when anything above raised.
        os.chdir(cwd)

    area = round(area, 7)  # keep 7 decimals to avoid too long data representation
    r_cost *= 1000  # nJ -> pJ/access
    w_cost *= 1000  # nJ -> pJ/access

    return access_time, area, r_cost, w_cost
__name__ == "imc_unit": + # branch when the script is run locally or called by A/DimcArray.py + from get_cacti_cost import get_cacti_cost +else: + from zigzag.classes.hardware.architecture.get_cacti_cost import get_cacti_cost + +############################################################################################################### +# This file includes: +# . class LogicUnit (defines the energy/area/delay cost of multipliers, adders, regs) +# . class ImcArray (provides initialization function, used for class DimcArray and AimcArray) +############################################################################################################### + +class LogicUnit: + """cost (energy, area, delay) of 1b adder, 1b multiplier, 1b register is defined in this class""" + def __init__(self, tech_param:dict): + """ + Input example: + tech_param_28nm = { + "vdd": 0.9, # unit: V + "nd2_cap": 0.7/1e3, # unit: pF + "nd2_area": 0.614/1e6, # unit: mm^2 + "nd2_dly": 0.0478, # unit: ns + "xor2_cap": 0.7*1.5/1e3, # unit: pF + "xor2_area":0.614*2.4/1e6, # unit: mm^2 + "xor2_dly": 0.0478*1.5, # unit: ns + "dff_cap": 0.7*3/1e3, # unit: pF + "dff_area": 0.0614*6/1e6, # unit: mm^2 + "dff_dly": 0.0478*3.4, # unit: ns + } + """ + """check input firstly""" + self.check_tech_param(tech_param) + """initialization""" + self.tech_param = tech_param + self.tech_param["wl_cap"] = tech_param["nd2_cap"]/2 # wordline cap of each SRAM cell is treated as NAND2_cap/2 + self.tech_param["bl_cap"] = tech_param["nd2_cap"]/2 # bitline cap of each SRAM cell is treated as NAND2_cap/2 + + def check_tech_param(self, tech_param): + required_param = ["tech_node", "vdd", "nd2_cap", "nd2_area", "nd2_dly", "xor2_cap", "xor2_area", "xor2_dly", "dff_cap", "dff_area"] + for ii_a, a in enumerate(required_param): + if a not in tech_param.keys(): + raise Exception(f"[LogicUnit] Incorrect input, required param [{a}] not found.") + if not (isinstance(tech_param[a], int) or isinstance(tech_param[a], float)): + raise 
Exception(f"[LogicUnit] Incorrect input, value [{tech_param[a]}] of param [{a}] is not a num.") + if tech_param[a] <= 0: + raise Exception(f"[LogicUnit] Incorrect input, value [{tech_param[a]}] of param [{a}] is not positive.") + + def get_1b_adder_energy(self): + """energy of 1b full adder""" + """Assume a 1b adder has 3 ND2 gate and 2 XOR2 gate""" + adder_cap = 3 * self.tech_param["nd2_cap"] + 2 * self.tech_param["xor2_cap"] + return adder_cap * (self.tech_param["vdd"]**2) # unit: pJ + + def get_1b_adder_energy_half_activated(self): + """energy of 1b full adder when 1 input is 0""" + adder_cap = 2 * self.tech_param["xor2_cap"] + return adder_cap * (self.tech_param["vdd"] ** 2) # unit: pJ + + def get_1b_multiplier_energy(self): + """energy of 1b multiplier""" + """1b mult includes 1 NOR gate, which is assumed as the same cost of ND2 gate""" + """why 0.5: considering weight stays constant during multiplication""" + return 0.5 * self.tech_param["nd2_cap"] * (self.tech_param["vdd"] ** 2) # unit: pJ + + def get_1b_reg_energy(self): + """energy of 1b DFF""" + return self.tech_param["dff_cap"] * (self.tech_param["vdd"] ** 2) # unit: pJ + + def get_1b_adder_area(self): + """area of 1b full adder""" + """Assume a 1b adder has 3 ND2 gate and 2 XOR2 gate""" + adder_area = 3 * self.tech_param["nd2_area"] + 2 * self.tech_param["xor2_area"] + return adder_area + + def get_1b_multiplier_area(self): + """area of 1b multiplier""" + """1b mult includes 1 NOR gate, which is assumed as the same cost of ND2 gate""" + return self.tech_param["nd2_area"] + + def get_1b_reg_area(self): + """area of 1b DFF""" + return self.tech_param["dff_area"] + + def get_1b_adder_dly_in2sum(self): + """delay of 1b adder: input to sum-out""" + adder_dly = 2 * self.tech_param["xor2_dly"] + return adder_dly + + def get_1b_adder_dly_in2cout(self): + """delay of 1b adder: input to carry-out""" + adder_dly = self.tech_param["xor2_dly"] + 2 * self.tech_param["nd2_dly"] + return adder_dly + + def 
class ImcUnit:
    """Common initialization and input validation shared by the digital and analog IMC models."""

    def __init__(self,tech_param:dict, hd_param:dict, dimensions:dict):
        """Validate the inputs, then store the array geometry.

        :param tech_param: technology parameters, checked and stored by LogicUnit
        :param hd_param: hardware parameters of the IMC array (see check_input for required keys)
        :param dimensions: mapping from operational-array dimension name to its size
        """
        # check input firstly
        self.check_input(hd_param, dimensions)
        # initialization
        self.hd_param = hd_param
        self.dimensions = dimensions
        self.wl_dim = hd_param["wordline_dimension"] # wl_dim should be the same with the dimension served by input_reg.
        self.bl_dim = hd_param["bitline_dimension"] # bl_dim should be the same with the dimension served by output_reg.
        self.wl_dim_size = dimensions[self.wl_dim] # dimension where wordline is
        self.bl_dim_size = dimensions[self.bl_dim] # dimension where bitline (adder tree) is
        # every dimension that is neither the wordline nor the bitline one multiplies the bank count
        self.nb_of_banks = math.prod([dimensions[oa_dim] for oa_dim in dimensions if oa_dim not in [self.wl_dim, self.bl_dim]])
        # tech_param will be checked and initialized in LogicUnit class
        self.logic_unit = LogicUnit(tech_param)
        # parameters to be updated in function
        self.energy = None
        self.energy_breakdown = None
        self.area = None
        self.area_breakdown = None
        self.delay = None
        self.delay_breakdown = None
        self.mapped_rows_total = None
        self.mapped_group_depth = None

    def check_input(self, hd_param, dimensions):
        """Raise an Exception describing the first invalid entry of hd_param / dimensions.

        Checks that every required hardware parameter is present and well-typed, that all
        dimension sizes are positive, and (for analog IMC) that a positive adc_resolution is given.
        """
        required_hd_param = [
            "imc_type", "input_precision", "weight_precision", "input_bit_per_cycle", "group_depth",
            "wordline_dimension", "bitline_dimension", "enable_cacti"
        ]
        for a in required_hd_param:
            if a not in hd_param.keys():
                raise Exception(f"[ImcArray] Incorrect hd_param, required param [{a}] not found.")
            if a == "imc_type":
                if hd_param[a] not in ["digital", "analog"]:
                    raise Exception(f"[ImcArray] Incorrect imc_type in hd_param, either [analog] or [digital] is expected.")
            elif a == "wordline_dimension" or a == "bitline_dimension":
                if not isinstance(hd_param[a], str) or hd_param[a] not in dimensions.keys():
                    raise Exception(f"[ImcArray] param [{a}] is not a str or is not a key in dimensions.")
            elif a == "enable_cacti":
                if not isinstance(hd_param[a], bool):
                    raise Exception(f"[ImcArray] param [{a}] is not bool (Ture, False).")
            else:
                if not isinstance(hd_param[a], (int, float)):
                    raise Exception(f"[ImcArray] Incorrect hd_param, value [{hd_param[a]}] of param [{a}] is not a num.")
                if hd_param[a] <= 0:
                    raise Exception(f"[ImcArray] Incorrect hd_param, value [{hd_param[a]}] of param [{a}] is not positive.")
                if a == "input_bit_per_cycle" and hd_param[a] > hd_param["input_precision"]:
                    input_precision = hd_param["input_precision"]
                    raise Exception(f"[ImcArray] Incorrect hd_param, value [{hd_param[a]}] of param [{a}] is bigger than [input_precision] ({input_precision}).")
        for oa_dim in dimensions.keys():
            if dimensions[oa_dim] <= 0:
                # Report the offending dimension itself. Using the loop variable of the
                # previous loop here looked up dimensions["enable_cacti"] and died with a
                # KeyError instead of this message.
                raise Exception(f"[ImcArray] Incorrect dimensions, value [{dimensions[oa_dim]}] of param [{oa_dim}] is not a positive number.")
        if hd_param["imc_type"] == "analog":
            a = "adc_resolution"
            if a not in hd_param.keys():
                raise Exception(f"[ImcArray] Incorrect hd_param, required param [{a}] not found.")
            # if adc_resolution is not a number or adc_resolution <= 0
            if (not isinstance(hd_param[a], (int, float))) or (hd_param[a] <= 0):
                raise Exception(f"[ImcArray] Incorrect hd_param, value [{hd_param[a]}] of param [{a}] is not a positive number.")

    def get_single_cell_array_cost_from_cacti(self, tech_node, wl_dim_size, bl_dim_size, group_depth, w_pres):
        """Get the cost of a single macro (cell array) from CACTI.

        Called when CACTI is required for cost estimation.

        @param tech_node: the technology node (e.g. 0.028, 0.032, 0.022 ... unit: um)
        @param wl_dim_size: the size of dimension where wordline is.
        @param bl_dim_size: the size of dimension where bitline (adder tree) is.
        @param group_depth: the size of each cell group (number of SRAM cells on local bitline)
        @param w_pres: weight precision (number of SRAM cells required to store a operand)
        @return: (access_time, area, r_cost, w_cost) exactly as returned by get_cacti_cost
        """
        cell_array_size = wl_dim_size * bl_dim_size * group_depth * w_pres / 8 # array size. unit: byte
        array_bw = wl_dim_size * w_pres # imc array bandwidth. unit: bit

        # the CACTI folder is addressed relative to the working directory, which differs
        # between running this file's folder locally and running through the zigzag package
        if __name__ == "imc_unit":
            cacti_path = "../../cacti/cacti_master"
        else:
            cacti_path = "zigzag/classes/cacti/cacti_master"
        access_time, area, r_cost, w_cost = get_cacti_cost(cacti_path=cacti_path,
                                                           tech_node=tech_node,
                                                           mem_type="sram",
                                                           mem_size_in_byte=cell_array_size,
                                                           bw=array_bw)
        return access_time, area, r_cost, w_cost
@staticmethod
def identify_layer_operand_representation(layer):
    """Return ``(act_operand, const_operand)`` for a layer.

    The activation operand is the first operand with a non-empty "pr" entry in
    ``layer.operand_loop_dim`` (conv layers). When no operand has one
    (fully-connected layers), the constant operand is the first operand whose
    "ir" entry is empty, and the activation is the other input operand.
    """
    loop_dims = layer.operand_loop_dim
    operands_with_pr = [name for name, dims in loop_dims.items() if len(dims["pr"]) > 0]
    if operands_with_pr:
        # conv layers: activation found directly, weight is the remaining input operand
        activation = operands_with_pr[0]
        constant = [name for name in layer.input_operands if name != activation][0]
        return activation, constant
    # fc layers: weight found directly, activation is the remaining input operand
    constant = [name for name, dims in loop_dims.items() if len(dims["ir"]) == 0][0]
    activation = [name for name in layer.input_operands if name != constant][0]
    return activation, constant
of file diff --git a/zigzag/classes/stages/CostModelStage.py b/zigzag/classes/stages/CostModelStage.py index 7ff0a99b..e25a759c 100644 --- a/zigzag/classes/stages/CostModelStage.py +++ b/zigzag/classes/stages/CostModelStage.py @@ -2,6 +2,7 @@ from zigzag.classes.stages.Stage import Stage from zigzag.classes.cost_model.cost_model import CostModelEvaluation +from zigzag.classes.cost_model.cost_model_for_sram_imc import CostModelEvaluationForIMC from zigzag.classes.hardware.architecture.accelerator import Accelerator from zigzag.classes.mapping.spatial.spatial_mapping import SpatialMapping from zigzag.classes.mapping.temporal.temporal_mapping import TemporalMapping @@ -33,7 +34,6 @@ def __init__( spatial_mapping_int, temporal_mapping, access_same_data_considered_as_no_access=True, - cost_model_class=CostModelEvaluation, **kwargs ): super().__init__(list_of_callables, **kwargs) @@ -52,19 +52,33 @@ def __init__( temporal_mapping, access_same_data_considered_as_no_access, ) - self.cost_model_class=cost_model_class ## Run the cost model stage by calling the internal zigzag cost model with the correct inputs. 
def run(self) -> Generator[Tuple[CostModelEvaluation, Any], None, None]:
    """Evaluate the stored mapping with the cost model that matches the layer's core.

    The core the layer is allocated to is looked up on the accelerator. If its
    operational array exposes ``pe_type == "in_sram_computing"`` the IMC cost
    model is used, otherwise the regular one. Yields ``(cme, None)`` once.
    """
    core_id = self.layer.core_allocation
    core = self.accelerator.get_core(core_id)
    # operational arrays without a pe_type attribute yield None here
    pe_type = getattr(core.operational_array, "pe_type", None)
    if pe_type is not None and pe_type in ["in_sram_computing"]:
        evaluation_class = CostModelEvaluationForIMC
    else:
        evaluation_class = CostModelEvaluation
    # both cost models take the same constructor arguments
    self.cme = evaluation_class(
        accelerator=self.accelerator,
        layer=self.layer,
        spatial_mapping=self.spatial_mapping,
        spatial_mapping_int=self.spatial_mapping_int,
        temporal_mapping=self.temporal_mapping,
        # the below parameter is optional
        access_same_data_considered_as_no_access=self.access_same_data_considered_as_no_access,
    )
    yield (self.cme, None)
def save_to_yaml(self, jsonname, yamlname):
    """Re-export an already written JSON result file as YAML.

    :param jsonname: path of the existing JSON file to read
    :param yamlname: path of the YAML file to create (parent folders are created if needed)
    """
    target_dir = os.path.dirname(yamlname)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError; only create folders when there is a folder part.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(jsonname, "r") as json_fp, open(yamlname, "w") as yaml_fp:
        yaml.dump(json.load(json_fp), yaml_fp, Dumper=yaml.SafeDumper)
+ core_id = layer.core_allocation + core = self.accelerator.get_core(core_id) + operational_array = core.operational_array + pe_type = getattr(operational_array, "pe_type", None) # return None if it does not exist + try: # branch if the workload is manually defined + layer_type = layer.layer_attrs["operator_type"] + except KeyError: # branch if the workload is from an onnx (key "operator_type" does not exist) + layer_type = None + if (pe_type in ["in_sram_computing"]) and (layer_type in ["Pooling", "Add"]): + continue + kwargs = self.kwargs.copy() kwargs["layer"] = layer + kwargs["accelerator"] = self.accelerator if layer.name: layer_name = layer.name else: diff --git a/zigzag/inputs/examples/hardware/Aimc.py b/zigzag/inputs/examples/hardware/Aimc.py new file mode 100755 index 00000000..3c9b20fb --- /dev/null +++ b/zigzag/inputs/examples/hardware/Aimc.py @@ -0,0 +1,242 @@ +import os, math +import random + +from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy +from zigzag.classes.hardware.architecture.memory_instance import MemoryInstance +from zigzag.classes.hardware.architecture.accelerator import Accelerator +from zigzag.classes.hardware.architecture.core import Core +from zigzag.classes.hardware.architecture.ImcArray import ImcArray +from zigzag.classes.hardware.architecture.get_cacti_cost import get_w_cost_per_weight_from_cacti +from zigzag.classes.hardware.architecture.get_cacti_cost import get_cacti_cost + +# Analog In-Memory Computing (AIMC) core definition +# This example will define an AIMC core with a single macro, sized 32 rows x 32 columns. 
+# Supported operand precision: 8 bit +# Technology node: 28 nm +# The architecture hierarchy looks like: +# ------- dram (I, W, O) ---------- +# | | +# sram (I, O) cell_group (W) +# |-> reg_I1 (I) --> imc_array <--| +# | | +# | <---> reg_O1 (O) <--> | + +def memory_hierarchy_dut(imc_array, visualize=False): + """ [OPTIONAL] Get w_cost of imc cell group from CACTI if required """ + cacti_path = "zigzag/classes/cacti/cacti_master" + tech_param = imc_array.unit.logic_unit.tech_param + hd_param = imc_array.unit.hd_param + dimensions = imc_array.unit.dimensions + output_precision = hd_param["input_precision"] + hd_param["weight_precision"] + if hd_param["enable_cacti"]: + # unit: pJ/weight writing + w_cost_per_weight_writing = get_w_cost_per_weight_from_cacti(cacti_path, tech_param, hd_param, dimensions) + else: + w_cost_per_weight_writing = hd_param["w_cost_per_weight_writing"] # user-provided value (unit: pJ/weight) + + """Memory hierarchy variables""" + """ size=#bit, bw=(read bw, write bw), cost=(read word energy, write work energy) """ + cell_group = MemoryInstance( + name="cell_group", + size=hd_param["weight_precision"] * hd_param["group_depth"], + r_bw=hd_param["weight_precision"], + w_bw=hd_param["weight_precision"], + r_cost=0, + w_cost=w_cost_per_weight_writing, # unit: pJ/weight + area=0, # this area is already included in imc_array + r_port=0, # no standalone read port + w_port=0, # no standalone write port + rw_port=1, # 1 port for both reading and writing + latency=0, # no extra clock cycle required + ) + reg_I1 = MemoryInstance( + name="rf_I1", + size=hd_param["input_precision"], + r_bw=hd_param["input_precision"], + w_bw=hd_param["input_precision"], + r_cost=0, + w_cost=tech_param["dff_cap"] * (tech_param["vdd"] ** 2) * hd_param["input_precision"], # pJ/access + area=tech_param["dff_area"] * hd_param["input_precision"], # mm^2 + r_port=1, + w_port=1, + rw_port=0, + latency=1, + ) + + reg_O1 = MemoryInstance( + name="rf_O1", + size=output_precision, + 
r_bw=output_precision, + w_bw=output_precision, + r_cost=0, + w_cost=tech_param["dff_cap"] * (tech_param["vdd"] ** 2) * output_precision, # pJ/access + area=tech_param["dff_area"] * output_precision, # mm^2 + r_port=2, + w_port=2, + rw_port=0, + latency=1, + ) + + ##################################### on-chip memory hierarchy building blocks ##################################### + + sram_size = 256 * 1024 # unit: byte + sram_bw = max(imc_array.unit.bl_dim_size * hd_param["input_precision"] * imc_array.unit.nb_of_banks, + imc_array.unit.wl_dim_size * output_precision * imc_array.unit.nb_of_banks) + ac_time, sram_area, sram_r_cost, sram_w_cost = get_cacti_cost(cacti_path, tech_param["tech_node"], "sram", + sram_size, sram_bw, + hd_hash=str(hash((sram_size, sram_bw, random.randbytes(8))))) + sram_256KB_256_3r_3w = MemoryInstance( + name="sram_256KB", + size=sram_size * 8, # byte -> bit + r_bw=sram_bw, + w_bw=sram_bw, + r_cost=sram_r_cost, + w_cost=sram_w_cost, + area=sram_area, + r_port=3, + w_port=3, + rw_port=0, + latency=1, + min_r_granularity=sram_bw//16, # assume there are 16 sub-banks + min_w_granularity=sram_bw//16, # assume there are 16 sub-banks + ) + + ####################################################################################################################### + + dram_size = 1*1024*1024*1024 # unit: byte + dram_ac_cost_per_bit = 3.7 # unit: pJ/bit + dram_bw = imc_array.unit.wl_dim_size * hd_param["weight_precision"] * imc_array.unit.nb_of_banks + dram_100MB_32_3r_3w = MemoryInstance( + name="dram_1GB", + size=dram_size*8, # byte -> bit + r_bw=dram_bw, + w_bw=dram_bw, + r_cost=dram_ac_cost_per_bit*dram_bw, # pJ/access + w_cost=dram_ac_cost_per_bit*dram_bw, # pJ/access + area=0, + r_port=3, + w_port=3, + rw_port=0, + latency=1, + min_r_granularity=dram_bw // 16, # assume there are 16 sub-banks + min_w_granularity=dram_bw // 16, # assume there are 16 sub-banks + ) + + memory_hierarchy_graph = MemoryHierarchy(operational_array=imc_array) + + """ + fh: 
from high = wr_in_by_high + fl: from low = wr_in_by_low + th: to high = rd_out_to_high + tl: to low = rd_out_to_low + """ + memory_hierarchy_graph.add_memory( + memory_instance=cell_group, + operands=("I2",), + port_alloc=({"fh": "rw_port_1", "tl": "rw_port_1", "fl": None, "th": None},), + served_dimensions=set(), + ) + memory_hierarchy_graph.add_memory( + memory_instance=reg_I1, + operands=("I1",), + port_alloc=({"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None},), + served_dimensions={(1, 0, 0)}, + ) + memory_hierarchy_graph.add_memory( + memory_instance=reg_O1, + operands=("O",), + port_alloc=( + {"fh": "w_port_1", "tl": "r_port_1", "fl": "w_port_2", "th": "r_port_2"},), + served_dimensions={(0, 1, 0)}, + ) + + ##################################### on-chip highest memory hierarchy initialization ##################################### + + memory_hierarchy_graph.add_memory( + memory_instance=sram_256KB_256_3r_3w, + operands=("I1","O",), + port_alloc=( + {"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None}, + {"fh": "w_port_2", "tl": "r_port_2", "fl": "w_port_3", "th": "r_port_3"}, + ), + served_dimensions="all", + ) + + #################################################################################################################### + + memory_hierarchy_graph.add_memory( + memory_instance=dram_100MB_32_3r_3w, + operands=("I1", "I2", "O"), + port_alloc=( + {"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None}, + {"fh": "w_port_2", "tl": "r_port_2", "fl": None, "th": None}, + {"fh": "w_port_1", "tl": "r_port_1", "fl": "w_port_3", "th": "r_port_3"}, + ), + served_dimensions="all", + ) + + if visualize: + from zigzag.visualization.graph.memory_hierarchy import ( + visualize_memory_hierarchy_graph, + ) + + visualize_memory_hierarchy_graph(memory_hierarchy_graph) + return memory_hierarchy_graph + + +def imc_array_dut(): + """Multiplier array variables""" + tech_param = { # 28nm + "tech_node":0.028, # unit: um + "vdd": 0.9, # unit: V + 
"nd2_cap": 0.7 / 1e3, # unit: pF + "xor2_cap": 0.7 * 1.5 / 1e3, # unit: pF + "dff_cap": 0.7 * 3 / 1e3, # unit: pF + "nd2_area": 0.614 / 1e6, # unit: mm^2 + "xor2_area":0.614 * 2.4 / 1e6, # unit: mm^2 + "dff_area": 0.614 * 6 / 1e6, # unit: mm^2 + "nd2_dly": 0.0478, # unit: ns + "xor2_dly": 0.0478 * 2.4, # unit: ns + # "dff_dly": 0.0478*3.4, # unit: ns + } + hd_param = { + "pe_type": "in_sram_computing", # for in-memory-computing. Digital core for different values. + "imc_type": "analog", # "digital" or "analog" + "input_precision": 8, # activation precision + "weight_precision": 8, # weight precision + "input_bit_per_cycle": 2, # nb_bits of input/cycle (treated as DAC resolution) + "group_depth": 1, # #cells/multiplier + "adc_resolution": 8, # ADC resolution + "wordline_dimension": "D1", # hardware dimension where wordline is (corresponds to the served dimension of input regs) + "bitline_dimension": "D2", # hardware dimension where bitline is (corresponds to the served dimension of output regs) + "enable_cacti": True, # use CACTI to estimated cell array area cost (cell array exclude build-in logic part) + # Energy of writing weight. Required when enable_cacti is False. + # "w_cost_per_weight_writing": 0.08, # [OPTIONAL] unit: pJ/weight. 
+ } + + + dimensions = { + "D1": 4, # wordline dimension + "D2": 32, # bitline dimension + "D3": 1, # nb_macros (nb_arrays) + } # {"D1": ("K", 4), "D2": ("C", 32),} + hd_param["adc_resolution"] = hd_param["input_bit_per_cycle"] + 0.5 * int(math.log2(dimensions["D2"])) + + aimc_array = ImcArray( + tech_param, hd_param, dimensions + ) + + return aimc_array + +def cores_dut(): + imc_array1 = imc_array_dut() + memory_hierarchy1 = memory_hierarchy_dut(imc_array1) + + core1 = Core(1, imc_array1, memory_hierarchy1) + + return {core1} + + +cores = cores_dut() +acc_name = os.path.basename(__file__)[:-3] +accelerator = Accelerator(acc_name, cores) diff --git a/zigzag/inputs/examples/hardware/Dimc.py b/zigzag/inputs/examples/hardware/Dimc.py new file mode 100644 index 00000000..47464bc9 --- /dev/null +++ b/zigzag/inputs/examples/hardware/Dimc.py @@ -0,0 +1,238 @@ +import os +import random +from zigzag.classes.hardware.architecture.memory_hierarchy import MemoryHierarchy +from zigzag.classes.hardware.architecture.memory_instance import MemoryInstance +from zigzag.classes.hardware.architecture.accelerator import Accelerator +from zigzag.classes.hardware.architecture.core import Core +from zigzag.classes.hardware.architecture.ImcArray import ImcArray +from zigzag.classes.hardware.architecture.get_cacti_cost import get_w_cost_per_weight_from_cacti +from zigzag.classes.hardware.architecture.get_cacti_cost import get_cacti_cost + +# Digital In-Memory Computing (DIMC) core definition +# This example will define an DIMC core with a single macro, sized 32 rows x 32 columns. 
+# Supported operand precision: 8 bit +# Technology node: 28 nm +# The architecture hierarchy looks like: +# ------- dram (I, W, O) ---------- +# | | +# sram (I, O) cell_group (W) +# |-> reg_I1 (I) --> imc_array <--| +# | | +# | <---> reg_O1 (O) <--> | + +def memory_hierarchy_dut(imc_array, visualize=False): + """ [OPTIONAL] Get w_cost of imc cell group from CACTI if required """ + cacti_path = "zigzag/classes/cacti/cacti_master" + tech_param = imc_array.unit.logic_unit.tech_param + hd_param = imc_array.unit.hd_param + dimensions = imc_array.unit.dimensions + output_precision = hd_param["input_precision"] + hd_param["weight_precision"] + if hd_param["enable_cacti"]: + # unit: pJ/weight writing + w_cost_per_weight_writing = get_w_cost_per_weight_from_cacti(cacti_path, tech_param, hd_param, dimensions) + else: + w_cost_per_weight_writing = hd_param["w_cost_per_weight_writing"] # user-provided value (unit: pJ/weight) + + """Memory hierarchy variables""" + """ size=#bit, bw=(read bw, write bw), cost=(read word energy, write work energy) """ + cell_group = MemoryInstance( + name="cell_group", + size=hd_param["weight_precision"] * hd_param["group_depth"], + r_bw=hd_param["weight_precision"], + w_bw=hd_param["weight_precision"], + r_cost=0, + w_cost=w_cost_per_weight_writing, # unit: pJ/weight + area=0, # this area is already included in imc_array + r_port=0, # no standalone read port + w_port=0, # no standalone write port + rw_port=1, # 1 port for both reading and writing + latency=0, # no extra clock cycle required + ) + reg_I1 = MemoryInstance( + name="rf_I1", + size=hd_param["input_precision"], + r_bw=hd_param["input_precision"], + w_bw=hd_param["input_precision"], + r_cost=0, + w_cost=tech_param["dff_cap"] * (tech_param["vdd"] ** 2) * hd_param["input_precision"], # pJ/access + area=tech_param["dff_area"] * hd_param["input_precision"], # mm^2 + r_port=1, + w_port=1, + rw_port=0, + latency=1, + ) + + reg_O1 = MemoryInstance( + name="rf_O1", + size=output_precision, + 
r_bw=output_precision, + w_bw=output_precision, + r_cost=0, + w_cost=tech_param["dff_cap"] * (tech_param["vdd"] ** 2) * output_precision, # pJ/access + area=tech_param["dff_area"] * output_precision, # mm^2 + r_port=2, + w_port=2, + rw_port=0, + latency=1, + ) + + ##################################### on-chip memory hierarchy building blocks ##################################### + + sram_size = 256 * 1024 # unit: byte + sram_bw = max(imc_array.unit.bl_dim_size * hd_param["input_precision"] * imc_array.unit.nb_of_banks, + imc_array.unit.wl_dim_size * output_precision * imc_array.unit.nb_of_banks) + ac_time, sram_area, sram_r_cost, sram_w_cost = get_cacti_cost(cacti_path, tech_param["tech_node"], "sram", + sram_size, sram_bw, + hd_hash=str(hash((sram_size, sram_bw, random.randbytes(8))))) + sram_256KB_256_3r_3w = MemoryInstance( + name="sram_256KB", + size=sram_size * 8, # byte -> bit + r_bw=sram_bw, + w_bw=sram_bw, + r_cost=sram_r_cost, + w_cost=sram_w_cost, + area=sram_area, + r_port=3, + w_port=3, + rw_port=0, + latency=1, + min_r_granularity=sram_bw//16, # assume there are 16 sub-banks + min_w_granularity=sram_bw//16, # assume there are 16 sub-banks + ) + + ####################################################################################################################### + + dram_size = 1*1024*1024*1024 # unit: byte + dram_ac_cost_per_bit = 3.7 # unit: pJ/bit + dram_bw = imc_array.unit.wl_dim_size * hd_param["weight_precision"] * imc_array.unit.nb_of_banks + dram_100MB_32_3r_3w = MemoryInstance( + name="dram_1GB", + size=dram_size*8, # byte -> bit + r_bw=dram_bw, + w_bw=dram_bw, + r_cost=dram_ac_cost_per_bit*dram_bw, # pJ/access + w_cost=dram_ac_cost_per_bit*dram_bw, # pJ/access + area=0, + r_port=3, + w_port=3, + rw_port=0, + latency=1, + min_r_granularity=dram_bw // 16, # assume there are 16 sub-banks + min_w_granularity=dram_bw // 16, # assume there are 16 sub-banks + ) + + memory_hierarchy_graph = MemoryHierarchy(operational_array=imc_array) + + """ + fh: 
from high = wr_in_by_high + fl: from low = wr_in_by_low + th: to high = rd_out_to_high + tl: to low = rd_out_to_low + """ + memory_hierarchy_graph.add_memory( + memory_instance=cell_group, + operands=("I2",), + port_alloc=({"fh": "rw_port_1", "tl": "rw_port_1", "fl": None, "th": None},), + served_dimensions=set(), + ) + memory_hierarchy_graph.add_memory( + memory_instance=reg_I1, + operands=("I1",), + port_alloc=({"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None},), + served_dimensions={(1, 0, 0)}, + ) + memory_hierarchy_graph.add_memory( + memory_instance=reg_O1, + operands=("O",), + port_alloc=( + {"fh": "w_port_1", "tl": "r_port_1", "fl": "w_port_2", "th": "r_port_2"},), + served_dimensions={(0, 1, 0)}, + ) + + ##################################### on-chip highest memory hierarchy initialization ##################################### + + memory_hierarchy_graph.add_memory( + memory_instance=sram_256KB_256_3r_3w, + operands=("I1","O",), + port_alloc=( + {"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None}, + {"fh": "w_port_2", "tl": "r_port_2", "fl": "w_port_3", "th": "r_port_3"}, + ), + served_dimensions="all", + ) + + #################################################################################################################### + + memory_hierarchy_graph.add_memory( + memory_instance=dram_100MB_32_3r_3w, + operands=("I1", "I2", "O"), + port_alloc=( + {"fh": "w_port_1", "tl": "r_port_1", "fl": None, "th": None}, + {"fh": "w_port_2", "tl": "r_port_2", "fl": None, "th": None}, + {"fh": "w_port_1", "tl": "r_port_1", "fl": "w_port_3", "th": "r_port_3"}, + ), + served_dimensions="all", + ) + + if visualize: + from zigzag.visualization.graph.memory_hierarchy import ( + visualize_memory_hierarchy_graph, + ) + + visualize_memory_hierarchy_graph(memory_hierarchy_graph) + return memory_hierarchy_graph + + +def imc_array_dut(): + """Multiplier array variables""" + tech_param = { # 28nm + "tech_node": 0.028, # unit: um + "vdd": 0.9, # unit: V + 
"nd2_cap": 0.7/1e3, # unit: pF + "xor2_cap": 0.7*1.5/1e3, # unit: pF + "dff_cap": 0.7*3/1e3, # unit: pF + "nd2_area": 0.614/1e6, # unit: mm^2 + "xor2_area":0.614*2.4/1e6, # unit: mm^2 + "dff_area": 0.614*6/1e6, # unit: mm^2 + "nd2_dly": 0.0478, # unit: ns + "xor2_dly": 0.0478*2.4, # unit: ns + # "dff_dly": 0.0478*3.4, # unit: ns + } + hd_param = { + "pe_type": "in_sram_computing", # for in-memory-computing. Digital core for different values. + "imc_type": "digital", # "digital" or "analog" + "input_precision": 8, # activation precision expected in the hardware + "weight_precision": 8, # weight precision expected in the hardware + "input_bit_per_cycle": 1, # nb_bits of input/cycle/PE + "group_depth": 1, # #cells/multiplier + "wordline_dimension": "D1", # hardware dimension where wordline is (corresponds to the served dimension of input regs) + "bitline_dimension": "D2", # hardware dimension where bitline is (corresponds to the served dimension of output regs) + "enable_cacti": True, # use CACTI to estimated cell array area cost (cell array exclude build-in logic part) + # Energy of writing weight. Required when enable_cacti is False. + # "w_cost_per_weight_writing": 0.08, # [OPTIONAL] unit: pJ/weight. + } + + dimensions = { + "D1": 4, # wordline dimension + "D2": 32, # bitline dimension + "D3": 1, # nb_macros (nb_arrays) + } # e.g. 
{"D1": ("K", 4), "D2": ("C", 32),} + + imc_array = ImcArray( + tech_param, hd_param, dimensions + ) + + return imc_array + +def cores_dut(): + imc_array1 = imc_array_dut() + memory_hierarchy1 = memory_hierarchy_dut(imc_array1) + + core1 = Core(1, imc_array1, memory_hierarchy1) + + return {core1} + + +cores = cores_dut() +acc_name = os.path.basename(__file__)[:-3] +accelerator = Accelerator(acc_name, cores) diff --git a/zigzag/inputs/examples/mapping/default_imc.py b/zigzag/inputs/examples/mapping/default_imc.py new file mode 100755 index 00000000..2bf26a20 --- /dev/null +++ b/zigzag/inputs/examples/mapping/default_imc.py @@ -0,0 +1,13 @@ +mapping = { + "default": { + "core_allocation": 1, + # "spatial_mapping": {"D1": ("OX", 25), "D2": (("FX", 3), ("FY", 3))}, + "memory_operand_links": {"O": "O", "W": "I2", "I": "I1"}, + "spatial_mapping_hint": {"D1": ["K", "OX"], "D2": ["C", "FX", "FY"]}, + }, + "Add": { # to avoid errors when the workload is manually defined and contains Add layers. + "core_allocation": 1, + "memory_operand_links": {"O": "O", "X": "I2", "Y": "I1"}, + "spatial_mapping_hint": {"D1": ["G"], "D2": ["C"]}, + }, +} diff --git a/zigzag/inputs/examples/workload/mlperf_tiny/deepautoencoder.onnx b/zigzag/inputs/examples/workload/mlperf_tiny/deepautoencoder.onnx new file mode 100644 index 00000000..b1df94b4 Binary files /dev/null and b/zigzag/inputs/examples/workload/mlperf_tiny/deepautoencoder.onnx differ diff --git a/zigzag/inputs/examples/workload/mlperf_tiny/ds_cnn.onnx b/zigzag/inputs/examples/workload/mlperf_tiny/ds_cnn.onnx new file mode 100644 index 00000000..d70b918d Binary files /dev/null and b/zigzag/inputs/examples/workload/mlperf_tiny/ds_cnn.onnx differ diff --git a/zigzag/inputs/examples/workload/mlperf_tiny/mobilenet_v1.onnx b/zigzag/inputs/examples/workload/mlperf_tiny/mobilenet_v1.onnx new file mode 100644 index 00000000..c50f4d60 Binary files /dev/null and b/zigzag/inputs/examples/workload/mlperf_tiny/mobilenet_v1.onnx differ diff --git 
a/zigzag/inputs/examples/workload/mlperf_tiny/resnet8.onnx b/zigzag/inputs/examples/workload/mlperf_tiny/resnet8.onnx new file mode 100644 index 00000000..e36519b4 Binary files /dev/null and b/zigzag/inputs/examples/workload/mlperf_tiny/resnet8.onnx differ diff --git a/zigzag/inputs/validation/hardware/sram_imc/README.md b/zigzag/inputs/validation/hardware/sram_imc/README.md new file mode 100755 index 00000000..7aeb9977 --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/README.md @@ -0,0 +1,46 @@ +## In-Memory Computing Model Extraction and Validation +This folder is where we did cost model extraction and validation for AIMC and DIMC. + +To see the validation details, +for AIMC model, you can run `python aimc_validation.py` under folder `aimc_validation/22-28nm/`. +For DIMC model, you can run `python model_extraction_28nm.py` under folder `dimc_validation/28nm/`, which will extract the best fitting value for energy/area/delay (tclk) model and the corresponding mismatch. +You can also run `python dimc_validation.py`, which will get the mismatch value and cost breakdown for each validated work. + +## Cost Model Overview +Our SRAM-based In-Memory Computing model is a versatile, parameterized model designed to cater to both Analog IMC and Digital IMC. +Since hardware costs are technology-node dependent, we have performed special calibration for the 28nm technology node. The model has been validated against 7 chips from the literature. +A summary of the hardware settings for these chips is provided in the following table. 
+ +| source | label | Bi/Bo/Bcycle | macro size | #cell_group | nb_of_macros | +|-----------------------------------------------------------------|-------|-----------------------------------------------|----------------|-------------|--------------| +| [paper](https://ieeexplore.ieee.org/abstract/document/9431575) | AIMC1 | 7 / 2 / 7 | 1024×512 | 1 | 1 | +| [paper](https://ieeexplore.ieee.org/abstract/document/9896828) | AIMC2 | 8 / 8 / 2 | 16×12 | 32 | 1 | +| [paper](https://ieeexplore.ieee.org/abstract/document/10067289) | AIMC3 | 8 / 8 / 1 | 64×256 | 1 | 8 | +| [paper](https://ieeexplore.ieee.org/abstract/document/9731762) | DIMC1 | 8 / 8 / 2 | 32×6 | 1 | 64 | +| [paper](https://ieeexplore.ieee.org/abstract/document/9731545) | DIMC2 | 8 / 8 / 1 | 32×1 | 16 | 2 | +| [paper](https://ieeexplore.ieee.org/abstract/document/10067260) | DIMC3 | 8 / 8 / 2 | 128×8 | 8 | 8 | +| [paper](https://ieeexplore.ieee.org/abstract/document/10067779) | DIMC4 | 8 / 8 / 1 | 128×8 | 2 | 4 | + +Bi/Bo/Bcycle: input precision/weight precision/number of bits processed per cycle per input. +#cell_group: the number of cells sharing one entry to computation logic. + +The validation results are displayed in the figure below (assuming a 50% input toggle rate and 50% weight sparsity). +The gray bar represents the reported performance value, while the colored bar represents the model estimation. +The percentage above the bars is the ratio between model estimation and the chip measurement results. + +

+imc model validation plot +

+ +- AIMC1 incurs additional area costs due to repeaters/decaps. +- Sparsity information is not available for AIMC2, DIMC2, DIMC4. +- AIMC1, AIMC3 were fabricated using 22nm technology, therefore the cost estimation was scaled accordingly. + +**Note:** + +The current integrated IMC model has certain limitations and is applicable only under the following conditions: +- The SRAM cell is a 6T memory cell. +- The adder tree follows a RCA (Ripple Carry Adder) structure without any approximation logic. +- The operands are of integer type rather than floating point. +- The voltage used for the delay estimation is fixed at 0.9 V. +- Sparsity impact is not included in the estimated energy cost. diff --git a/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/aimc1_validation_subfunc.py b/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/aimc1_validation_subfunc.py new file mode 100755 index 00000000..84acc9bc --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/aimc1_validation_subfunc.py @@ -0,0 +1,137 @@ +from aimc_cost_model import ADC, DAC +from dimc_cost_model import UnitDff, MultiplierArray, MemoryInstance + +def aimc1_cost_estimation(aimc, cacti_value): + unit_reg = UnitDff(aimc['unit_area'], aimc['unit_delay'], aimc['unit_cap']) + unit_area = aimc['unit_area'] + unit_delay = aimc['unit_delay'] + unit_cap = aimc['unit_cap'] + input_channel = aimc['input_channel'] + reg_input_bitwidth = aimc['reg_input_bitwidth'] + input_bandwidth = input_channel * aimc['input_precision'] + output_bandwidth_per_channel = aimc['output_precision'] + """ + multiplier array for each output channel + """ + mults = MultiplierArray(vdd=aimc['vdd'],input_precision=int(aimc['multiplier_precision']),number_of_multiplier=input_channel, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + """ + adder_tree for each output channel + """ + adder_tree = None + + """ + accumulator for each output channel + """ + 
accumulator = None + + """ + ADC cost for each output channel + """ + adc = ADC(resolution=aimc['adc_resolution'], ICH=aimc['input_channel']) + + """ + DAC cost for each input channel + """ + dac = DAC(resolution=aimc['dac_resolution']) + + """ + memory instance (delay unit: ns, energy unit: fJ, area unit: mm2) + unitbank: sram bank, data from CACTI + regs_input: input register files + regs_output: output register files for each output channel + regs_accumulator: register files inside accumulator for each output channel (congifuration is same with regs_output) + """ + unitbank = MemoryInstance(name='unitbank', size=aimc['rows']*aimc['cols'], r_bw=aimc['cols'], w_bw=aimc['cols'], delay=cacti_value['delay']*0, r_energy=cacti_value['r_energy'], w_energy=cacti_value['w_energy'], area=cacti_value['area'], r_port=1, w_port=1, rw_port=0, latency=0) + energy_wl = 0 # per output channel + energy_bl = aimc['input_channel'] * aimc['unit_cap']/2*2 * aimc['vdd']**2 # per output channel (aimc['unit_cap']/2 for bitline cap/cell, *2 for 2 bitline port of 2 cells connecting together) + energy_en = aimc['input_channel'] * aimc['unit_cap']/2 * aimc['vdd']**2 # per output channel (energy cost on "csbias" enable signal) + + + """ + calculate result + :predicted_area: The area cost for entire IMC core (unit: mm2) + :predicted_delay: The minimum delay of single clock period (unit: ns) + :predicted_energy_per_cycle: The energy cost each time the IMC core is activated (unit: fJ) + :number_of_cycle: The number of cycle for computing entire input + :predicted_energy: The energy cost for computing entire input (unit: fJ) + :number_of_operations: The number of operations executed when computing entire input + :predicted_tops: Peak TOP/s + :predicted_topsw: Peak TOP/s/W + """ + + ## Area cost breakdown + area_mults = aimc['banks'] * aimc['output_channel'] * mults.calculate_area() + area_adder_tree = 0 + area_accumulator = 0 + area_banks = aimc['banks'] * 2*unitbank.area # 2 for pulse generators 
(repeators in papers) (it's an assumption) + area_regs_accumulator = 0 + area_regs_pipeline = 0 + area_adc = aimc['banks'] * aimc['output_channel'] * adc.calculate_area() + area_dac = aimc['banks'] * aimc['input_channel'] * dac.calculate_area() + + # (for beyong ADC/DAC part, scale from 28nm -> 22nm, exclude ADC/DAC, which is assumed indepedent from tech.) (assume linear) + area_mults = area_mults/28*22 + area_adder_tree = area_adder_tree/28*22 + area_accumulator = area_accumulator/28*22 + area_banks = area_banks # the area is for 22 nm + area_regs_accumulator = area_regs_accumulator/28*22 + area_regs_pipeline = area_regs_pipeline/28*22 + area_adc = area_adc/28*22 + area_dac = area_dac/28*22 + + predicted_area = area_mults + area_adder_tree + area_accumulator + area_banks + area_regs_accumulator + area_regs_pipeline + area_adc + area_dac# cost of input/output regs has been taken out # (scale from 22nm -> 28nm, exclude ADC/DAC, which is assumed indepedent from tech.) (assume linear) + + ## delay cost + predicted_delay = unitbank.delay + mults.calculate_delay() + adc.calculate_delay() + + ## Energy cost breakdown per cycle + energy_mults = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * mults.calculate_energy() + energy_adder_tree = 0 + energy_accumulator = 0 + energy_banks = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * (energy_wl + energy_bl + energy_en) + energy_regs_accumulator = 0 + energy_regs_pipeline = 0 + energy_adc = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * adc.calculate_energy(vdd=aimc['vdd']) + energy_dac = aimc['banks'] * aimc['input_channel'] * dac.calculate_energy(vdd=aimc['vdd'], k0=aimc['dac_energy_k0']) + + # (for beyong ADC/DAC part, scale from 28nm -> 22nm, exclude ADC/DAC, which is assumed indepedent from tech.) 
(assume linear) + energy_mults = energy_mults/28*22 + energy_adder_tree = energy_adder_tree/28*22 + energy_accumulator = energy_accumulator/28*22 + energy_banks = energy_banks/28*22 + energy_regs_accumulator = energy_regs_accumulator/28*22 + energy_regs_pipeline = energy_regs_pipeline/28*22 + + + predicted_energy_per_cycle = energy_mults + energy_adder_tree + energy_accumulator + energy_banks + energy_regs_accumulator + energy_regs_pipeline + energy_adc + energy_dac + + number_of_cycle = aimc['activation_precision']/aimc['input_precision'] + + predicted_energy = predicted_energy_per_cycle * number_of_cycle + + number_of_operations = 2*aimc['banks']*aimc['output_channel']*aimc['input_channel'] # 1MAC = 2 Operations + + predicted_tops = number_of_operations/(predicted_delay*number_of_cycle) / (10**3) + predicted_topsw = number_of_operations/predicted_energy * 10**3 + + ## Energy breakdown per MAC + number_of_mac = number_of_operations/2 + energy_mults_mac = energy_mults * number_of_cycle/number_of_mac + energy_adder_tree_mac = 0 + energy_accumulator_mac = 0 + energy_banks_mac = energy_banks * number_of_cycle/number_of_mac + energy_regs_accumulator_mac = 0 + energy_regs_pipeline_mac = 0 + energy_adc_mac = energy_adc * number_of_cycle/number_of_mac + energy_dac_mac = energy_dac * number_of_cycle/number_of_mac + energy_estimation_per_mac = predicted_energy/number_of_mac + energy_reported_per_mac = 2000/aimc['TOP/s/W'] + + area_mismatch = abs(predicted_area/aimc['area']-1) + delay_mismatch = abs(predicted_delay/aimc['tclk']-1) + energy_mismatch = abs(energy_estimation_per_mac/energy_reported_per_mac-1) + #return predicted_area, predicted_delay, energy_estimation_per_mac + #return area_mismatch, delay_mismatch, energy_mismatch + #print(area_mults, area_adder_tree, area_accumulator, area_banks, area_regs_accumulator, area_regs_pipeline) + #print(energy_mults_mac, energy_adder_tree_mac, energy_accumulator_mac, energy_banks_mac, energy_regs_accumulator_mac, 
def aimc2_cost_estimation(aimc, cacti_value):
    """
    Estimate area, clock period and energy of the AIMC macro described by `aimc`
    and compare the estimates with the reported figures stored in the same dict.

    :param aimc: dict of macro parameters. Keys read here include the technology
                 units ('unit_area', 'unit_delay', 'unit_cap'), 'vdd', the array
                 geometry ('rows', 'cols', 'banks', 'input_channel', 'output_channel'),
                 the precision entries, 'weight_sparsity', 'dac_energy_k0' and the
                 reported results 'area', 'tclk' and 'TOP/s/W'.
    :param cacti_value: dict with the keys 'delay', 'r_energy', 'w_energy' and 'area'
                        for one SRAM bank.
    :return: None. NOTE(review): both `return` statements at the end are commented
             out, so every value computed here is currently discarded -- confirm
             which tuple callers are supposed to receive.
    """
    unit_reg = UnitDff(aimc['unit_area'], aimc['unit_delay'], aimc['unit_cap'])
    unit_area = aimc['unit_area']
    unit_delay = aimc['unit_delay']
    unit_cap = aimc['unit_cap']
    input_channel = aimc['input_channel']
    # The next three values are read but not used anywhere below.
    reg_input_bitwidth = aimc['reg_input_bitwidth']
    input_bandwidth = input_channel * aimc['input_precision']
    output_bandwidth_per_channel = aimc['output_precision']
    """
    multiplier array for each output channel
    """
    col_mux = 2
    mults = MultiplierArray(vdd=aimc['vdd'],input_precision=int(aimc['multiplier_precision']),number_of_multiplier=col_mux*input_channel, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap)

    """
    adder_tree for each output channel
    """
    # adder tree with place value
    # adder1..adder4 are used for the area/energy sums below; adder_tree is only used for the delay.
    adder1 = Adder(vdd=aimc['vdd'], input_precision=7, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) # 8 in total
    adder2 = Adder(vdd=aimc['vdd'], input_precision=9, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) # 4 in total
    adder3 = Adder(vdd=aimc['vdd'], input_precision=12, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) # 2 in total
    adder4 = Adder(vdd=aimc['vdd'], input_precision=15, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) # 1 in total
    adder_tree = AdderTree(vdd=aimc['vdd'], input_precision=int(aimc['adder_input_precision']), number_of_input=input_channel, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap)

    """
    accumulator for each output channel
    """
    accumulator = Adder(vdd=aimc['vdd'], input_precision=int(aimc['accumulator_precision']), unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap)

    """
    ADC cost for each ADC
    """
    adc = ADC(resolution=aimc['adc_resolution'], ICH=aimc['input_channel'])

    """
    DAC cost for each DAC
    """
    dac = DAC(resolution=aimc['dac_resolution'])

    """
    memory instance (delay unit: ns, energy unit: fJ, area unit: mm2)
    unitbank: sram bank, data from CACTI
    regs_accumulator: register files inside accumulator for each output channel (congifuration is same with regs_output)
    """
    # delay is multiplied by 0, so the CACTI bank delay does not contribute to the result.
    unitbank = MemoryInstance(name='unitbank', size=aimc['rows']*aimc['cols'], r_bw=aimc['cols'], w_bw=aimc['cols'], delay=cacti_value['delay']*0, r_energy=cacti_value['r_energy'], w_energy=cacti_value['w_energy'], area=cacti_value['area'], r_port=1, w_port=1, rw_port=0, latency=0)
    regs_accumulator = MemoryInstance(name='regs_accumulator', size=aimc['reg_accumulator_precision'], r_bw=aimc['reg_accumulator_precision'], w_bw=aimc['reg_accumulator_precision'], delay=unit_reg.calculate_delay(), r_energy=0, w_energy=unit_reg.calculate_cap() * aimc['vdd']**2 * aimc['reg_accumulator_precision'], area=unit_reg.calculate_area()*aimc['reg_accumulator_precision'], r_port=1, w_port=1, rw_port=0, latency=0)
    # The pipeline register width is hard-coded to 5*16 bits.
    regs_pipeline = MemoryInstance(name='regs_pipeline', size=5*16, r_bw=5*16, w_bw=5*16, delay=0, r_energy=0, w_energy=unit_reg.calculate_cap() * aimc['vdd']**2 * 5 * 16, area=unit_reg.calculate_area()*5*16, r_port=1, w_port=1, rw_port=0, latency=1)

    energy_wl = aimc['input_channel'] * aimc['unit_cap']/2*2 * aimc['vdd']**2 * aimc['weight_precision'] # per output channel
    #energy_bl = aimc['rows'] * aimc['unit_cap']/2*2 * aimc['vdd']**2 * aimc['weight_precision'] # per output channel (aimc['unit_cap']/2 for bitline cap/cell, *2 for 2 bitline port of 2 cells connecting together)
    energy_en = aimc['input_channel'] * aimc['unit_cap']/2*2 * aimc['vdd']**2 # per output channel (energy cost on "en" enable signal)
    energy_bl = 0 # assume bitline doesn't change during computation

    """
    calculate result
    :predicted_area: The area cost for entire IMC core (unit: mm2)
    :predicted_delay: The minimum delay of single clock period (unit: ns)
    :predicted_energy_per_cycle: The energy cost each time the IMC core is activated (unit: fJ)
    :number_of_cycle: The number of cycle for computing entire input
    :predicted_energy: The energy cost for computing entire input (unit: fJ)
    :number_of_operations: The number of operations executed when computing entire input
    :predicted_tops: Peak TOP/s
    :predicted_topsw: Peak TOP/s/W
    """

    ## Area cost breakdown
    area_mults = aimc['banks'] * aimc['output_channel'] * mults.calculate_area()
    #area_adder_tree = aimc['banks'] * aimc['output_channel'] * adder_tree.calculate_area()
    area_adder_tree = aimc['banks'] * aimc['output_channel'] * ( 8*adder1.calculate_area() + 4*adder2.calculate_area() + 2*adder3.calculate_area() + 1*adder4.calculate_area() )
    area_accumulator = aimc['banks'] * aimc['output_channel'] * accumulator.calculate_area()
    area_banks = aimc['banks'] *unitbank.area
    area_regs_accumulator = aimc['banks'] * aimc['output_channel'] * regs_accumulator.area
    area_regs_pipeline = aimc['banks'] * aimc['output_channel'] * regs_pipeline.area
    # The factor 16 is a hard-coded ADC count per output channel -- presumably one per input channel; TODO confirm.
    area_adc = aimc['banks'] * aimc['output_channel'] * 16 * adc.calculate_area()
    area_dac = aimc['banks'] * 2 * aimc['input_channel'] * dac.calculate_area()


    predicted_area = area_mults + area_adder_tree + area_accumulator + area_banks + area_regs_accumulator + area_regs_pipeline + area_adc + area_dac# cost of input/output regs has been taken out

    ## delay cost (2* for input transfer two times)
    adder_1b_carry_delay = 2*UnitNand2(unit_area, unit_delay, unit_cap).calculate_delay()
    accumulator_delay = accumulator.calculate_delay_lsb()+adder_1b_carry_delay * (aimc['reg_accumulator_precision']-aimc['accumulator_input_precision'])
    # The clock period is the slower of the analog path (bank + multiplier + ADC) and the digital path (adder tree + accumulator).
    predicted_delay = max(2* (unitbank.delay + mults.calculate_delay() + adc.calculate_delay()), 2*(adder_tree.calculate_delay() + accumulator_delay))

    ## Energy cost breakdown per input transfer
    energy_mults = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * mults.calculate_energy()
    #energy_adder_tree = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * adder_tree.calculate_energy()
    energy_adder_tree = (1 - aimc['weight_sparsity']) * aimc['banks'] * aimc[
        'output_channel'] * ( 8*adder1.calculate_energy() + 4*adder2.calculate_energy() + 2*adder3.calculate_energy() + 1*adder4.calculate_energy() )
    energy_accumulator = aimc['banks'] * aimc['output_channel'] * accumulator.calculate_energy()
    energy_banks = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * (energy_wl + energy_bl + energy_en)
    energy_regs_accumulator = aimc['banks'] * aimc['output_channel'] * regs_accumulator.w_energy
    energy_regs_pipeline = aimc['banks'] * aimc['output_channel'] * regs_pipeline.w_energy
    energy_adc = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * 16 * adc.calculate_energy(vdd=aimc['vdd'])
    energy_dac = aimc['banks'] * 2 * aimc['input_channel'] * dac.calculate_energy(vdd=aimc['vdd'], k0=aimc['dac_energy_k0'])

    ## 2* for input transfer two times
    energy_mults *= 2
    energy_adder_tree *= 2
    energy_accumulator *= 2
    energy_banks *= 2
    energy_regs_accumulator *= 2
    energy_regs_pipeline *= 2
    energy_adc *= 2
    energy_dac *= 2


    predicted_energy_per_cycle = energy_mults + energy_adder_tree + energy_accumulator + energy_banks + energy_regs_accumulator + energy_regs_pipeline + energy_adc + energy_dac

    number_of_cycle = aimc['activation_precision']/aimc['input_precision']

    predicted_energy = predicted_energy_per_cycle * number_of_cycle

    number_of_operations = 2*aimc['banks']*aimc['output_channel']*aimc['input_channel'] # 1MAC = 2 Operations

    # operations per ns divided by 10**3 gives TOP/s; operations per fJ times 10**3 gives TOP/s/W.
    predicted_tops = number_of_operations/(predicted_delay*number_of_cycle) / (10**3)
    predicted_topsw = number_of_operations/predicted_energy * 10**3

    ## Energy breakdown per MAC
    number_of_mac = number_of_operations/2
    energy_mults_mac = energy_mults * number_of_cycle/number_of_mac
    energy_adder_tree_mac = energy_adder_tree * number_of_cycle/number_of_mac
    energy_accumulator_mac = energy_accumulator * number_of_cycle/number_of_mac
    energy_banks_mac = energy_banks * number_of_cycle/number_of_mac
    energy_regs_accumulator_mac = energy_regs_accumulator * number_of_cycle/number_of_mac
    energy_regs_pipeline_mac = energy_regs_pipeline * number_of_cycle/number_of_mac
    energy_adc_mac = energy_adc * number_of_cycle/number_of_mac
    energy_dac_mac = energy_dac * number_of_cycle/number_of_mac
    energy_estimation_per_mac = predicted_energy/number_of_mac
    # Reported efficiency converted to fJ per MAC (1 MAC = 2 operations).
    energy_reported_per_mac = 2000/aimc['TOP/s/W']

    # Relative deviation of each estimate from the reported value.
    area_mismatch = abs(predicted_area/aimc['area']-1)
    delay_mismatch = abs(predicted_delay/aimc['tclk']-1)
    energy_mismatch = abs(energy_estimation_per_mac/energy_reported_per_mac-1)
    #return predicted_area, predicted_delay, energy_estimation_per_mac
    #return area_mismatch, delay_mismatch, energy_mismatch
    #print(area_mults, area_adder_tree, area_accumulator, area_banks, area_regs_accumulator, area_regs_pipeline)
    #print(energy_mults_mac, energy_adder_tree_mac, energy_accumulator_mac, energy_banks_mac, energy_regs_accumulator_mac, energy_regs_pipeline_mac)
def aimc3_cost_estimation(aimc, cacti_value):
    """
    Estimate area, clock period and energy of the AIMC macro described by `aimc`
    and compare the estimates with the reported figures stored in the same dict.

    This variant has no accumulator, no DAC and no register files: those
    contributions are set to None/0 below.

    :param aimc: dict of macro parameters. Keys read here include the technology
                 units ('unit_area', 'unit_delay', 'unit_cap'), 'vdd', the array
                 geometry ('rows', 'cols', 'banks', 'input_channel', 'output_channel'),
                 the precision entries, 'input_toggle_rate', 'weight_sparsity',
                 'compact_rule' and the reported results 'area', 'tclk' and 'TOP/s/W'.
    :param cacti_value: dict with the keys 'delay', 'r_energy', 'w_energy' and 'area'
                        for one SRAM bank.
    :return: None. NOTE(review): both `return` statements at the end are commented
             out, so every value computed here is currently discarded -- confirm
             which tuple callers are supposed to receive.
    """
    unit_reg = UnitDff(aimc['unit_area'], aimc['unit_delay'], aimc['unit_cap'])
    unit_area = aimc['unit_area']
    unit_delay = aimc['unit_delay']
    unit_cap = aimc['unit_cap']
    input_channel = aimc['input_channel']
    # The next three values are read but not used anywhere below.
    reg_input_bitwidth = aimc['reg_input_bitwidth']
    input_bandwidth = input_channel * aimc['input_precision']
    output_bandwidth_per_channel = aimc['output_precision']
    """
    multiplier array for each output channel
    """
    # `mults` (64 multipliers) feeds the area and delay estimates; `mults_energy` (16 multipliers) feeds the energy estimate.
    mults = MultiplierArray(vdd=aimc['vdd'],input_precision=int(aimc['multiplier_precision']),number_of_multiplier=64, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap)
    mults_energy = MultiplierArray(vdd=aimc['vdd'],input_precision=int(aimc['multiplier_precision']),number_of_multiplier=16, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) # mapped multipliers

    """
    adder_tree for each output channel
    """
    # A single 6-bit adder stands in for the adder tree here.
    adder_tree = Adder(vdd=aimc['vdd'], input_precision=6, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap)

    """
    accumulator for each output channel
    """
    accumulator = None

    """
    ADC cost for each ADC
    """
    adc = ADC(resolution=aimc['adc_resolution'], ICH=aimc['input_channel'])
    adc_area = ADC(resolution=7, ICH=64) # for estimating area

    """
    DAC cost for each DAC
    """
    dac = None

    """
    memory instance (delay unit: ns, energy unit: fJ, area unit: mm2)
    unitbank: sram bank, data from CACTI
    regs_accumulator: register files inside accumulator for each output channel (congifuration is same with regs_output)
    """
    # delay is multiplied by 0, so the CACTI bank delay does not contribute to the result.
    unitbank = MemoryInstance(name='unitbank', size=aimc['rows']*aimc['cols'], r_bw=aimc['cols'], w_bw=aimc['cols'], delay=cacti_value['delay']*0, r_energy=cacti_value['r_energy'], w_energy=cacti_value['w_energy'], area=cacti_value['area'], r_port=1, w_port=1, rw_port=0, latency=0)
    regs_accumulator = None
    regs_pipeline = None

    energy_wl = 0 # per output channel
    # for energy_bl, there are 64 rows in total, but only 8 are used.
    energy_bl = aimc['input_channel'] * aimc['unit_cap']/2 * aimc['vdd']**2 * aimc['weight_precision'] # per output channel (aimc['unit_cap']/2 for bitline cap/cell, *2 for 2 bitline port of 2 cells connecting together)
    energy_en = 0 # per output channel (no enable signal) (for cap coupling-based AIMC, no enable signal is required, as long as the input sequence timing is gated when not under computation.)


    """
    calculate result
    :predicted_area: The area cost for entire IMC core (unit: mm2)
    :predicted_delay: The minimum delay of single clock period (unit: ns)
    :predicted_energy_per_cycle: The energy cost each time the IMC core is activated (unit: fJ)
    :number_of_cycle: The number of cycle for computing entire input
    :predicted_energy: The energy cost for computing entire input (unit: fJ)
    :number_of_operations: The number of operations executed when computing entire input
    :predicted_tops: Peak TOP/s
    :predicted_topsw: Peak TOP/s/W
    """

    ## Area cost breakdown
    area_mults = aimc['banks'] * aimc['output_channel'] * mults.calculate_area()
    area_adder_tree = aimc['banks'] * aimc['output_channel'] * adder_tree.calculate_area()
    area_accumulator = 0
    if aimc['compact_rule'] == False:
        # NOTE(review): the code applies a factor 3 while the original comment says 2 -- confirm which is intended.
        area_banks = aimc['banks'] * 3*unitbank.area # 2 for non-compact rule scaling
    else:
        area_banks = aimc['banks'] *unitbank.area
    area_regs_accumulator = 0
    area_regs_pipeline = 0
    area_adc = aimc['banks'] * aimc['output_channel'] * adc_area.calculate_area()
    area_dac = 0 # =0

    # (for the part beyond ADC/DAC, scale from 28nm -> 22nm, exclude ADC/DAC, which is assumed independent from tech.) (assume linear)
    # NOTE(review): area_adc and area_dac are scaled below as well, unlike in the energy section -- confirm.
    area_mults = area_mults/28*22
    area_adder_tree = area_adder_tree/28*22
    area_accumulator = area_accumulator/28*22
    area_banks = area_banks/28*22
    area_regs_accumulator = area_regs_accumulator/28*22
    area_regs_pipeline = area_regs_pipeline/28*22
    area_adc = area_adc/28*22
    area_dac = area_dac/28*22


    predicted_area = area_mults + area_adder_tree + area_accumulator + area_banks + area_regs_accumulator + area_regs_pipeline + area_adc + area_dac# cost of input/output regs has been taken out

    ## delay cost
    predicted_delay = unitbank.delay + mults.calculate_delay() + adder_tree.calculate_delay_msb() + adc.calculate_delay()

    ## Energy cost breakdown per input transfer
    energy_mults = aimc['input_toggle_rate'] * (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * mults_energy.calculate_energy()
    energy_adder_tree = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * adder_tree.calculate_energy()
    energy_accumulator = 0
    energy_banks = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * (energy_wl + energy_bl + energy_en)
    energy_regs_accumulator = 0
    energy_regs_pipeline = 0
    energy_adc = (1-aimc['weight_sparsity']) * aimc['banks'] * aimc['output_channel'] * adc.calculate_energy(vdd=aimc['vdd'])
    energy_dac = 0

    # (for the part beyond ADC/DAC, scale from 28nm -> 22nm, exclude ADC/DAC, which is assumed independent from tech.) (assume linear)
    energy_mults = energy_mults/28*22
    energy_adder_tree = energy_adder_tree/28*22
    energy_accumulator = energy_accumulator/28*22
    energy_banks = energy_banks/28*22
    energy_regs_accumulator = energy_regs_accumulator/28*22
    energy_regs_pipeline = energy_regs_pipeline/28*22


    predicted_energy_per_cycle = energy_mults + energy_adder_tree + energy_accumulator + energy_banks + energy_regs_accumulator + energy_regs_pipeline + energy_adc + energy_dac

    number_of_cycle = aimc['activation_precision']/aimc['input_precision']

    predicted_energy = predicted_energy_per_cycle * number_of_cycle

    number_of_operations = 2*aimc['banks']*aimc['output_channel']*aimc['rows']/8 # 1MAC = 2 Operations

    # operations per ns divided by 10**3 gives TOP/s; operations per fJ times 10**3 gives TOP/s/W.
    predicted_tops = number_of_operations/(predicted_delay*number_of_cycle) / (10**3)
    predicted_topsw = number_of_operations/predicted_energy * 10**3

    ## Energy breakdown per MAC
    number_of_mac = number_of_operations/2
    energy_mults_mac = energy_mults * number_of_cycle/number_of_mac
    energy_adder_tree_mac = energy_adder_tree * number_of_cycle/number_of_mac
    energy_accumulator_mac = energy_accumulator * number_of_cycle/number_of_mac
    energy_banks_mac = energy_banks * number_of_cycle/number_of_mac
    energy_regs_accumulator_mac = energy_regs_accumulator * number_of_cycle/number_of_mac
    energy_regs_pipeline_mac = 0
    energy_adc_mac = energy_adc * number_of_cycle/number_of_mac
    energy_dac_mac = energy_dac * number_of_cycle/number_of_mac
    energy_estimation_per_mac = predicted_energy/number_of_mac
    # Reported efficiency converted to fJ per MAC (1 MAC = 2 operations).
    energy_reported_per_mac = 2000/aimc['TOP/s/W']

    # Relative deviation of each estimate from the reported value.
    area_mismatch = abs(predicted_area/aimc['area']-1)
    delay_mismatch = abs(predicted_delay/aimc['tclk']-1)
    energy_mismatch = abs(energy_estimation_per_mac/energy_reported_per_mac-1)
    #return predicted_area, predicted_delay, energy_estimation_per_mac
    #return area_mismatch, delay_mismatch, energy_mismatch
    #print(area_mults, area_adder_tree, area_accumulator, area_banks, area_regs_accumulator, area_regs_pipeline)
    #print(energy_mults_mac, energy_adder_tree_mac, energy_accumulator_mac, energy_banks_mac, energy_regs_accumulator_mac, energy_regs_pipeline_mac)
class ADC:
    """
    Cost model of a single ADC.

    :param resolution: ADC resolution (unit: bit)
    :param ICH: The number of input channels on bitline (ADC input node)

    The supply voltage is not a constructor argument; it is passed to
    calculate_energy().
    """
    def __init__(self, resolution: int, ICH: int):
        self.resolution = resolution
        self.ICH = ICH

    def calculate_area(self):
        """
        area: The area cost (unit: mm2). The value is also stored on self.area.
        """
        if self.resolution < 12:
            self.area = (10**-6) * 10 ** (-0.0369 * self.resolution+1.206) * 2**self.resolution
        else:
            self.area = 5 * 10**-7 * 2**self.resolution
        return self.area

    def calculate_delay(self):
        """
        delay: The delay cost (unit: ns), linear in the resolution and in the
        number of input channels. The value is also stored on self.delay.
        """
        self.delay = self.resolution * (0.00653*self.ICH+0.640)
        return self.delay

    def calculate_energy(self, vdd, k1=100, k2=0.001):
        """
        energy: The energy cost (unit: fJ). The value is also stored on self.energy.
        :param vdd: The supply vdd (unit: V)
        :param k1: fitting constant of the term linear in the resolution (unit: fF)
        :param k2: fitting constant of the 4**resolution term (unit: fF)
        """
        self.energy = (k1 * self.resolution + k2 * 4**self.resolution) * vdd**2
        return self.energy


class DAC:
    """
    Cost model of a single DAC. Area and delay are modelled as zero.

    :param resolution: DAC resolution (unit: bit)

    The supply voltage is not a constructor argument; it is passed to
    calculate_energy().
    """
    def __init__(self, resolution: int):
        self.resolution = resolution

    def calculate_area(self):
        """
        area: The area cost, always 0. The value is also stored on self.area.
        """
        self.area = 0
        return self.area

    def calculate_delay(self):
        """
        delay: The delay cost, always 0. The value is also stored on self.delay.
        """
        self.delay = 0
        return self.delay

    def calculate_energy(self, vdd, k0):
        """
        energy: The energy cost (unit: fJ). The value is also stored on self.energy.
        :param vdd: The supply vdd (unit: V)
        :param k0: fitting constant per bit of resolution (unit: fF)
        """
        self.energy = (k0 * self.resolution) * vdd**2
        return self.energy
+ :param resolution: DAC resolution + :param vdd: The supply vdd (unit: V) + """ + def __init__(self, resolution: int): + self.resolution = resolution + def calculate_area(self): + self.area = 0 + return self.area + def calculate_delay(self): + self.delay = 0 + return self.delay + def calculate_energy(self, vdd, k0): # unit: fF + self.energy = (k0 * self.resolution) * vdd**2 + return self.energy \ No newline at end of file diff --git a/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/aimc_validation.py b/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/aimc_validation.py new file mode 100755 index 00000000..e58e2b5c --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/aimc_validation.py @@ -0,0 +1,146 @@ +from aimc1_validation_subfunc import aimc1_cost_estimation +from aimc2_validation_subfunc import aimc2_cost_estimation +from aimc3_validation_subfunc import aimc3_cost_estimation + +""" +CICC2021 (Assume 100% input toggle rate, 0% weight sparsity) +""" +aimc1 = { # https://ieeexplore.ieee.org/document/9431575 (22nm) + 'paper_idx': 'CICC2021', + 'input_toggle_rate': 1, # assumption + 'weight_sparsity': 0, # assumption + 'activation_precision': 7, + 'weight_precision': 2, + 'output_precision': 6, # output precision (unit: bit) + 'input_precision': 7, + 'input_channel': 1024, # how many input in parallel (per bank) + 'output_channel': 512, # how many output in parallel (per bank) + 'adc_resolution': 6, + 'dac_resolution': 7, + 'booth_encoding': False, + 'multiplier_precision': 2, + 'adder_input_precision':None, + 'accumulator_precision':None, + 'reg_accumulator_precision': None, + 'reg_input_bitwidth': None, + 'pipeline': False, + 'vdd': 0.8, # V + 'rows': 1024, # equal to the number of input channels + 'cols': 1024, + 'banks': 1, # number of cores + 'compact_rule': False, # not used + 'area': 1.9425, # mm2 (in code, the area will scale from 28nm -> 22nm) + 'tclk': 1000/22.5, # ns (assume tclk 
doesn't scale with technology) + 'TOP/s': None, + 'TOP/s/W': 1050, # (in code, the energy will scale from 28nm -> 22nm) + 'unit_area': 0.614, # um2 + 'unit_delay': 0.0478, #ns + 'unit_cap': 0.7, #fF + 'dac_energy_k0': 50 #fF (energy validation fitting parameter, which is taken directly from the value in TinyML paper) + } +cacti1 = { # 131072B, bw: 1024 + 'delay': 0.106473, #ns + 'r_energy': None, # not used + 'w_energy': None, # not used + 'area': 0.24496704 #mm2 + } + +""" +JSSC2023 (Assume 100% input toggle rate, 0% weight sparsity) +""" +aimc2 = { # https://ieeexplore.ieee.org/document/9896828/ (28nm) + 'paper_idx': 'JSSC2023', + 'input_toggle_rate': 1, # assumption + 'weight_sparsity': 0, # assumption + 'activation_precision': 8, + 'weight_precision': 8, + 'output_precision': 20, # output precision (unit: bit) + 'input_precision': 8, + 'input_channel': 16, # how many input in parallel (per bank) + 'output_channel': 12, # how many output in parallel (per bank) + 'adc_resolution': 5, + 'dac_resolution': 2, + 'booth_encoding': False, + 'multiplier_precision': 2, + 'adder_input_precision':12, + 'accumulator_input_precision':16, + 'accumulator_precision':20, + 'reg_accumulator_precision': 20, + 'reg_input_bitwidth': None, + 'pipeline': True, + 'vdd': 0.9, # V + 'rows': 32*16, # equal to the number of input channels + 'cols': 8*2*12, # *2 for column MUX + 'banks': 4, # number of cores + 'compact_rule': True, + 'area': 0.468, # mm2 + 'tclk': 7.2, # ns + 'TOP/s': None, + 'TOP/s/W': 15.02, + 'unit_area': 0.614, # um2 + 'unit_delay': 0.0478, #ns + 'unit_cap': 0.7, #fF + 'dac_energy_k0': 50 #fF + } +cacti2 = { #98304b , bw: 96 + 'delay': 0.16111872, #ns + 'r_energy': None, #fJ @ 0.9V # not used + 'w_energy': None, #fJ @ 0.9V # not used + 'area': 0.0360450648 #mm2 + } + +""" +ISSCC2023, 7.8 (Assume 37.5% input toggle rate, 50% weight sparsity) +""" +aimc3 = { # https://ieeexplore.ieee.org/document/10067289 (22nm) + 'paper_idx': 'ISSCC2023, 7.8', + 'input_toggle_rate': 
0.375, # assumption + 'weight_sparsity': 0.5, # assumption + 'activation_precision': 8, + 'weight_precision': 8, + 'output_precision': 24, # output precision (unit: bit) + 'input_precision': 1, + 'input_channel': 8, # how many input in parallel (per bank) + 'output_channel': 256, # how many output in parallel (per bank) + 'adc_resolution': 3, + 'dac_resolution': 0, + 'booth_encoding': False, + 'multiplier_precision': 1, + 'adder_input_precision':None, + 'accumulator_precision':None, + 'reg_accumulator_precision': None, + 'reg_input_bitwidth': None, + 'pipeline': False, + 'vdd': 0.8, # V @ 22nm + 'rows': 64, + 'cols': 256, + 'banks': 8, # number of cores + 'compact_rule': True, + 'area': 1.88, # mm2 (in code, the area will scale from 28nm -> 22nm) + 'tclk': 1000/364, # ns + 'TOP/s': None, + 'TOP/s/W': 18.7, # (in code, the area will scale from 28nm -> 22nm) + 'unit_area': 0.614, # um2 + 'unit_delay': 0.0478, #ns + 'unit_cap': 0.7, #fF + 'dac_energy_k0': 50 #fF + } +cacti3 = { # 64*256, bw: 256 + 'delay': 0.0722227, #ns not used # delay of array will be merged into ADC delay + 'r_energy': None, #fJ @ 0.9V # not used + 'w_energy': None, #fJ @ 0.9V # not used + 'area': 0.004505472 #mm2 + } + + + +if __name__ == '__main__': + """ + For energy fitting, fit: dac_energy_k0 + For area fitting, fit: cell scaling factor (2 for now), constant in ADC formula + For delay fitting, fit: constant in ADC formula + """ +# print(aimc1_cost_estimation(aimc1, cacti1) ) # aimc1 +# print(aimc2_cost_estimation(aimc2, cacti2) ) # aimc2 + print(aimc3_cost_estimation(aimc3, cacti3) ) # aimc3 + diff --git a/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/dimc_cost_model.py b/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/dimc_cost_model.py new file mode 100755 index 00000000..0801ffb4 --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/aimc_validation/22-28nm/dimc_cost_model.py @@ -0,0 +1,299 @@ +import math + +class UnitNor2: + """ + Class for 
class _UnitCell:
    """
    Shared storage and accessors for the cost figures of one standard cell.
    :param area: The area cost as stored on the instance
    :param delay: The delay cost (unit: ns)
    :param cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, area, delay, cap):
        self.area = area
        self.delay = delay
        self.cap = cap

    def calculate_area(self):
        return self.area

    def calculate_delay(self):
        return self.delay

    def calculate_cap(self):
        return self.cap


class UnitNor2(_UnitCell):
    """
    Class for a single NOR2 gate (the reference cell: delay and cap are taken as given).
    :param unit_area: The area cost; it is divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns)
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area/(10**6), delay=unit_delay, cap=unit_cap)


class UnitNand2(_UnitCell):
    """
    Class for a single NAND2 gate (same figures as the reference cell).
    :param unit_area: The area cost; it is divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns)
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area/(10**6), delay=unit_delay, cap=unit_cap)


class UnitXor2(_UnitCell):
    """
    Class for a single XOR2 gate: 2.4x the reference area and delay, 1.5x the reference cap.
    :param unit_area: The area cost; it is scaled and divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns)
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area*2.4/(10**6), delay=unit_delay*2.4, cap=unit_cap*1.5)


class UnitDff(_UnitCell):
    """
    Class for a single 1-b DFF: 6x the reference area, 3x the reference cap, zero delay.
    :param unit_area: The area cost; it is scaled and divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns); accepted but not used, the stored delay is 0
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area*6/(10**6), delay=0, cap=unit_cap*3)
class Multiplier:
    """
    Class for a single multiplier that performs 1 bit x multiple bits,
    modelled as one NOR2 gate per input bit.
    """
    def __init__(self, vdd: float, input_precision: int, unit_area: float, unit_delay: float, unit_cap: float):
        """
        :param vdd: The supply voltage (unit: V)
        :param input_precision: The bit precision of the input (unit: bit)
        :param unit_area: forwarded to the NOR2 gate model
        :param unit_delay: forwarded to the NOR2 gate model
        :param unit_cap: forwarded to the NOR2 gate model
        """
        self.nor2 = UnitNor2(unit_area, unit_delay, unit_cap)
        self.vdd = vdd
        # output precision = input precision
        self.input_precision = self.output_precision = input_precision

    def calculate_area(self):
        """
        area: The area cost (unit: mm2)
        """
        return self.nor2.calculate_area() * self.input_precision

    def calculate_delay(self):
        """
        delay: The delay cost (unit: ns), a single gate delay
        """
        return self.nor2.calculate_delay()

    def calculate_energy(self):
        """
        energy: The energy cost (unit: fJ)
        """
        # /2 is because only input will change, weight doesn't change
        switched_cap = self.nor2.calculate_cap()/2
        return switched_cap * self.vdd**2 * self.input_precision


class MultiplierArray:
    """
    Class for an array of identical 1 bit x multiple bits multipliers working in parallel.
    """
    def __init__(self, vdd: float, input_precision: int, number_of_multiplier: int, unit_area: float, unit_delay: float, unit_cap: float):
        """
        :param vdd: The supply voltage (unit: V)
        :param input_precision: The bit precision of the input (unit: bit)
        :param number_of_multiplier: The number of multiplier
        :param unit_area: forwarded to the multiplier model
        :param unit_delay: forwarded to the multiplier model
        :param unit_cap: forwarded to the multiplier model
        """
        self.mult = Multiplier(vdd, input_precision, unit_area, unit_delay, unit_cap)
        self.vdd = vdd
        # output precision = input precision
        self.input_precision = self.output_precision = input_precision
        self.number_of_multiplier = number_of_multiplier

    def calculate_area(self):
        """
        area: The area cost (unit: mm2), summed over all multipliers
        """
        return self.mult.calculate_area() * self.number_of_multiplier

    def calculate_delay(self):
        """
        delay: The delay cost (unit: ns), the delay of one multiplier
        """
        return self.mult.calculate_delay()

    def calculate_energy(self):
        """
        energy: The energy cost (unit: fJ), summed over all multipliers
        """
        return self.mult.calculate_energy() * self.number_of_multiplier
class Adder:
    def __init__(self, vdd: float, input_precision: int, unit_area: float, unit_delay: float, unit_cap: float):
        """
        Class for a {input_precision}-b Carry-Ripple Adder
        :param vdd: The supply voltage (unit: V)
        :param input_precision: The bit precision of the input (unit: bit)
        :param unit_area: forwarded to the NAND2/XOR2 gate models
        :param unit_delay: forwarded to the NAND2/XOR2 gate models
        :param unit_cap: forwarded to the NAND2/XOR2 gate models

        Derived attributes:
        output_precision: The bit precision of the output (input_precision + 1)
        number_of_1b_adder: The number of 1-b adder in the adder (input_precision)
        """
        self.nand2 = UnitNand2(unit_area, unit_delay, unit_cap)
        self.xor2 = UnitXor2(unit_area, unit_delay, unit_cap)
        self.vdd = vdd
        self.input_precision = input_precision
        self.output_precision = input_precision + 1
        self.number_of_1b_adder = input_precision

    def calculate_area(self):
        """
        area: The area cost (unit: mm2); each 1-b adder is counted as 3 NAND2 + 2 XOR2
        """
        area = (3*self.nand2.calculate_area() + 2*self.xor2.calculate_area())*self.number_of_1b_adder
        return area

    def calculate_delay_lsb(self):
        """
        delay: The delay cost for LSB (unit: ns) (best-case delay, also equals to the delay for Tsum of 1-b adder)
        """
        delay_sum = 2*self.xor2.calculate_delay() # 2 XOR gate delay (A-to-Sum)
        return delay_sum

    def calculate_delay_msb(self):
        """
        delay: The delay cost for MSB (unit: ns) (worst-case delay)
        """
        delay_carry = (self.xor2.calculate_delay() + 2*self.nand2.calculate_delay()) + (2*self.nand2.calculate_delay()) * (self.input_precision-1) # A-to-Cout -> Cin-to-Cout * (precision-1)
        return delay_carry

    def calculate_energy(self):
        """
        energy: The energy cost (each time it is triggered) (unit: fJ)
        """
        energy = (2*self.xor2.calculate_cap() + 3*self.nand2.calculate_cap()) * self.vdd**2 * self.number_of_1b_adder
        return energy


class AdderTree:
    def __init__(self, vdd: float, input_precision: int, number_of_input: int, unit_area: float, unit_delay: float, unit_cap: float):
        """
        Class for a {input_number} {input_precision}-b Carry-Ripple Adder Tree
        :param vdd: The supply voltage (unit: V)
        :param input_precision: The bit precision of the input (unit: bit)
        :param number_of_input: The number of inputs; must be a positive integer power of 2
        :param unit_area: forwarded to the adder model
        :param unit_delay: forwarded to the adder model
        :param unit_cap: forwarded to the adder model
        :raises ValueError: if number_of_input is not a positive integer power of 2

        Derived attributes:
        depth: The number of adder stages, log2(number_of_input)
        output_precision: The bit precision of the output (input_precision + depth)
        number_of_1b_adder: The number of 1-b adder in the adder tree
        """
        self.depth = self._tree_depth(number_of_input)
        self.vdd = vdd
        self.input_precision = input_precision
        self.number_of_input = number_of_input
        self.output_precision = input_precision + self.depth
        self.number_of_1b_adder = number_of_input*(input_precision+1)-(input_precision+self.depth+1)
        self.unit_area = unit_area
        self.unit_delay = unit_delay
        self.unit_cap = unit_cap

    @staticmethod
    def _tree_depth(number_of_input):
        """
        Return log2(number_of_input) as an exact integer.

        Integer bit arithmetic is used instead of math.log(x, 2), whose floating
        point result is not exact for every power of 2 (e.g. 2**29), which would
        reject valid sizes.
        """
        try:
            count = int(number_of_input)
        except (OverflowError, ValueError):
            count = 0  # inf / nan: rejected by the check below
        if count != number_of_input or count < 1 or count & (count - 1):
            raise ValueError("The number of input for the adder tree is not in the power of 2. Currently it is: %s" %number_of_input)
        return count.bit_length() - 1

    def calculate_area(self):
        """
        area: The area cost (unit: mm2)
        """
        # calculate area directly from the total number of 1-b adders
        area = self.number_of_1b_adder * Adder(vdd=self.vdd, input_precision=1, unit_area=self.unit_area, unit_delay=self.unit_delay, unit_cap=self.unit_cap).calculate_area()
        return area

    def calculate_delay(self):
        """
        delay: The delay cost (unit: ns)
        """
        if self.depth == 0:
            # A single input passes through without any adder stage; the formula
            # below would otherwise subtract one LSB delay and could go negative.
            return 0
        last_adder = Adder(vdd=self.vdd, input_precision=self.output_precision-1, unit_area=self.unit_area, unit_delay=self.unit_delay, unit_cap=self.unit_cap)
        # LSB delay for every stage but the last, plus the full carry chain of the last stage
        delay = last_adder.calculate_delay_lsb() * (self.depth-1) + last_adder.calculate_delay_msb()
        return delay

    def calculate_energy(self):
        """
        energy: The energy cost (each time it is triggered) (unit: fJ)
        """
        energy = self.number_of_1b_adder * Adder(vdd=self.vdd, input_precision=1, unit_area=self.unit_area, unit_delay=self.unit_delay, unit_cap=self.unit_cap).calculate_energy()
        return energy
class MemoryInstance:
    """
    class for: regs (input regs, output regs), memory bank (copy from Zigzag code, with area, delay added)
    """
    def __init__(self, name: str, size: int, r_bw: int, w_bw: int, delay: float, r_energy: float, w_energy: float, area: float,
                 r_port: int=1, w_port: int=1, rw_port: int=0, latency: int=1,
                 min_r_granularity=None, min_w_granularity=None):
        """
        Collect all the basic information of a physical memory module.
        :param name: memory module name, e.g. 'SRAM_512KB_BW_16b', 'I_RF'
        :param size: total memory capacity (unit: bit)
        :param r_bw/w_bw: memory bandwidth (or wordlength) (unit: bit/cycle)
        :param delay: clock-to-output delay (unit: ns)
        :param r_energy/w_energy: memory unit data access energy (unit: fJ)
        :param area: memory area (unit: mm2)
        :param r_port: number of memory read port
        :param w_port: number of memory write port (rd_port and wr_port can work in parallel)
        :param rw_port: number of memory port for both read and write (read and write cannot happen in parallel)
        :param latency: memory access latency (unit: number of cycles)
        :param min_r_granularity/min_w_granularity: minimal read/write access width, stored as
               r_bw_min/w_bw_min; when omitted (or falsy) the full r_bw/w_bw is used
        """
        self.name = name
        self.size = size
        self.r_bw = r_bw
        self.w_bw = w_bw
        self.delay = delay
        self.r_energy = r_energy
        self.w_energy = w_energy
        self.area = area
        self.r_port = r_port
        self.w_port = w_port
        self.rw_port = rw_port
        self.latency = latency
        self.r_bw_min = min_r_granularity if min_r_granularity else r_bw
        self.w_bw_min = min_w_granularity if min_w_granularity else w_bw

    def __jsonrepr__(self):
        """
        JSON Representation of this class to save it to a json file.
        """
        return self.__dict__

    def __eq__(self, other: object) -> bool:
        return isinstance(other, MemoryInstance) and self.__dict__ == other.__dict__

    def __hash__(self) -> int:
        """
        Hash consistent with __eq__ (equal attribute dicts give equal hashes).

        Defining __eq__ alone would make instances unhashable. The hash is derived
        from the current attribute values, so an instance must not be mutated while
        it is stored in a set or used as a dict key.
        """
        return hash(tuple(sorted(self.__dict__.items())))
class _UnitCell:
    """
    Shared storage and accessors for the cost figures of one standard cell.
    :param area: The area cost as stored on the instance
    :param delay: The delay cost (unit: ns)
    :param cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, area, delay, cap):
        self.area = area
        self.delay = delay
        self.cap = cap

    def calculate_area(self):
        return self.area

    def calculate_delay(self):
        return self.delay

    def calculate_cap(self):
        return self.cap


class UnitNor2(_UnitCell):
    """
    Class for a single NOR2 gate (the reference cell: delay and cap are taken as given).
    :param unit_area: The area cost; it is divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns)
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area/(10**6), delay=unit_delay, cap=unit_cap)


class UnitNand2(_UnitCell):
    """
    Class for a single NAND2 gate (same figures as the reference cell).
    :param unit_area: The area cost; it is divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns)
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area/(10**6), delay=unit_delay, cap=unit_cap)


class UnitXor2(_UnitCell):
    """
    Class for a single XOR2 gate: 2.4x the reference area and delay, 1.5x the reference cap.
    :param unit_area: The area cost; it is scaled and divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns)
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area*2.4/(10**6), delay=unit_delay*2.4, cap=unit_cap*1.5)


class UnitDff(_UnitCell):
    """
    Class for a single 1-b DFF: 6x the reference area, 3x the reference cap, zero delay.
    :param unit_area: The area cost; it is scaled and divided by 10**6 before being stored (um2 in -> mm2 stored)
    :param unit_delay: The delay cost (unit: ns); accepted but not used, the stored delay is 0
    :param unit_cap: The input capacitance including all input ports (unit: fF)
    """
    def __init__(self, unit_area: float, unit_delay: float, unit_cap: float):
        super().__init__(area=unit_area*6/(10**6), delay=0, cap=unit_cap*3)
+ :param unit_area: The area cost (unit: um2) + :param unit_delay: The delay cost (unit: ns) + :param unit_cap: The input capacitance including all input ports (unit: fF) + """ + def __init__(self, unit_area: float, unit_delay: float, unit_cap: float): + self.area = unit_area*6/(10**6) + self.delay = 0 + self.cap = unit_cap*3 + def calculate_area(self): + return self.area + def calculate_delay(self): + return self.delay + def calculate_cap(self): + return self.cap +############################################################################################################### +class Multiplier: + def __init__(self, vdd: float, input_precision: int, unit_area: float, unit_delay: float, unit_cap: float): + """ + Class for a single multiplier that performs 1 bit x multiple bits + :param vdd: The supply voltage (unit: V) + :param input_precision: The bit precision of the input (unit: bit) + :param output_precision: The bit precision of the output (unit: bit) + """ + self.nor2 = UnitNor2(unit_area, unit_delay, unit_cap) + self.vdd = vdd + self.input_precision = input_precision + self.output_precision = input_precision # output precision = input precision + + def calculate_area(self): + """ + area: The area cost (unit: mm2) + """ + area = self.nor2.calculate_area() * self.input_precision + return area + + def calculate_delay(self): + """ + delay: The delay cost (unit: ns) + """ + delay = self.nor2.calculate_delay() + return delay + + def calculate_energy(self): + """ + energy: The energy cost (unit: fJ) + """ + energy = self.nor2.calculate_cap()/2 * self.vdd**2 * self.input_precision # /2 is because only input will change, weight doesn't change + return energy + +class MultiplierArray: + def __init__(self, vdd: float, input_precision: int, number_of_multiplier: int, unit_area: float, unit_delay: float, unit_cap: float): + """ + Class for an array of multipliers, each of which performs 1 bit x multiple bits + :param vdd: The supply voltage (unit: V) + :param input_precision: The bit 
precision of the input (unit: bit) + :param output_precision: The bit precision of the output (unit: bit) + :param number_of_multiplier: The number of multiplier + """ + self.mult = Multiplier(vdd, input_precision, unit_area, unit_delay, unit_cap) + self.vdd = vdd + self.input_precision = input_precision + self.output_precision = input_precision # output precision = input precision + self.number_of_multiplier = number_of_multiplier + + def calculate_area(self): + """ + area: The area cost (unit: mm2) + """ + area = self.mult.calculate_area() * self.number_of_multiplier + return area + + def calculate_delay(self): + """ + delay: The delay cost (unit: ns) + """ + delay = self.mult.calculate_delay() + return delay + + def calculate_energy(self): + """ + energy: The energy cost (unit: fJ) + """ + energy = self.mult.calculate_energy() * self.number_of_multiplier + return energy + + +class Adder: + def __init__(self, vdd: float, input_precision: int, unit_area: float, unit_delay: float, unit_cap: float): + """ + Class for a {input_precision}-b Carry-Ripple Adder + :param vdd: The supply voltage (unit: V) + :param input_precision: The bit precision of the input (unit: bit) + :param output_precision: The bit precision of the output (unit: bit) + :param number_of_1b_adder: The number of 1-b adder in the adder tree + """ + self.nand2 = UnitNand2(unit_area, unit_delay, unit_cap) + self.xor2 = UnitXor2(unit_area, unit_delay, unit_cap) + self.vdd = vdd + self.input_precision = input_precision + self.output_precision = input_precision + 1 + self.number_of_1b_adder = input_precision + + def calculate_area(self): + """ + area: The area cost (unit: mm2) + """ + area = (3*self.nand2.calculate_area() + 2*self.xor2.calculate_area())*self.number_of_1b_adder + return area + + def calculate_delay_lsb(self): + """ + delay: The delay cost for LSB (unit: ns) (best-case delay, also equals to the delay for Tsum of 1-b adder) + """ + delay_sum = 2*self.xor2.calculate_delay() # 2 XOR gate delay 
(A-to-Sum) + return delay_sum + + def calculate_delay_msb(self): + """ + delay: The delay cost for MSB (unit: ns) (worst-case delay) + """ + delay_carry = (self.xor2.calculate_delay() + 2*self.nand2.calculate_delay()) + (2*self.nand2.calculate_delay()) * (self.input_precision-1) # A-to-Cout -> Cin-to-Cout * (precision-1) + return delay_carry + + def calculate_energy(self): + """ + energy: The energy cost (each time it is triggered) (unit: fJ) + """ + energy = (2*self.xor2.calculate_cap() + 3*self.nand2.calculate_cap()) * self.vdd**2 * self.number_of_1b_adder + return energy + + +class AdderTree: + def __init__(self, vdd: float, input_precision: int, number_of_input: int, unit_area: float, unit_delay: float, unit_cap: float): + """ + Class for a {number_of_input} {input_precision}-b Carry-Ripple Adder Tree + :param vdd: The supply voltage (unit: V) + :param input_precision: The bit precision of the input (unit: bit) + :param number_of_input: The number of inputs + :param output_precision: The bit precision of the output (unit: bit) + :param number_of_1b_adder: The number of 1-b adder in the adder tree + """ + if(math.log(number_of_input,2)%1 != 0): + raise ValueError("The number of input for the adder tree is not in the power of 2. 
Currently it is: %s" %number_of_input) + self.vdd = vdd + self.input_precision = input_precision + self.number_of_input = number_of_input + self.depth = int( math.log(number_of_input, 2) ) + self.output_precision = input_precision + self.depth + self.number_of_1b_adder = number_of_input*(input_precision+1)-(input_precision+self.depth+1) + self.unit_area = unit_area + self.unit_delay = unit_delay + self.unit_cap = unit_cap + + def calculate_area(self): + """ + area: The area cost (unit: mm2) + """ + # calculate area iteratively + # area_b = 0 + # for stage_idx in range(0, self.depth): + # single_adder = Adder(self.vdd, self.input_precision+stage_idx) + # area_b += single_adder.calculate_area() * math.ceil( self.number_of_input/(2**(stage_idx+1)) ) + # calculate area directly + area = self.number_of_1b_adder * Adder(vdd=self.vdd, input_precision=1, unit_area=self.unit_area, unit_delay=self.unit_delay, unit_cap=self.unit_cap).calculate_area() + return area + + def calculate_delay(self): + """ + delay: The delay cost (unit: ns) + """ + last_adder = Adder(vdd=self.vdd, input_precision=self.output_precision-1, unit_area=self.unit_area, unit_delay=self.unit_delay, unit_cap=self.unit_cap) + delay = last_adder.calculate_delay_lsb() * (self.depth-1) + last_adder.calculate_delay_msb() + return delay + + def calculate_energy(self): + """ + energy: The energy cost (each time it is triggered) (unit: fJ) + """ + energy = self.number_of_1b_adder * Adder(vdd=self.vdd, input_precision=1, unit_area=self.unit_area, unit_delay=self.unit_delay, unit_cap=self.unit_cap).calculate_energy() + return energy + + + + +class MemoryInstance: + """ + Class for: regs (input regs, output regs), memory bank (copied from ZigZag code, with area and delay added) + """ + def __init__(self, name: str, size: int, r_bw: int, w_bw: int, delay: float, r_energy: float, w_energy: float, area: float, + r_port: int=1, w_port: int=1, rw_port: int=0, latency: int=1, + min_r_granularity=None, min_w_granularity=None): + 
""" + Collect all the basic information of a physical memory module. + :param name: memory module name, e.g. 'SRAM_512KB_BW_16b', 'I_RF' + :param size: total memory capacity (unit: bit) + :param r_bw/w_bw: memory bandwidth (or wordlength) (unit: bit/cycle) + :param delay: clock-to-output delay (unit: ns) + :param r_energy/w_energy: memory unit data access energy (unit: fJ) + :param area: memory area (unit: mm2) + :param r_port: number of memory read port + :param w_port: number of memory write port (rd_port and wr_port can work in parallel) + :param rw_port: number of memory port for both read and write (read and write cannot happen in parallel) + :param latency: memory access latency (unit: number of cycles) + """ + self.name = name + self.size = size + self.r_bw = r_bw + self.w_bw = w_bw + self.delay = delay + self.r_energy = r_energy + self.w_energy = w_energy + self.area = area + self.r_port = r_port + self.w_port = w_port + self.rw_port = rw_port + self.latency = latency + if not min_r_granularity: + self.r_bw_min = r_bw + else: + self.r_bw_min = min_r_granularity + if not min_w_granularity: + self.w_bw_min = w_bw + else: + self.w_bw_min = min_w_granularity + + def __jsonrepr__(self): + """ + JSON Representation of this class to save it to a json file. 
+ """ + return self.__dict__ + + def __eq__(self, other: object) -> bool: + return isinstance(other, MemoryInstance) and self.__dict__ == other.__dict__ + + +################ + diff --git a/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation.py b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation.py new file mode 100755 index 00000000..8ace92be --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation.py @@ -0,0 +1,142 @@ +from dimc_validation_subfunc import dimc_cost_estimation + +""" +ISSCC2022, 15.5 (50% input toggle rate, 50% weight sparsity) +""" +dimc_ISSCC2022_15_5 = { # https://ieeexplore.ieee.org/document/9731762 (28nm) + 'paper_idx': 'ISSCC2022, 15.5', + 'input_toggle_rate': 0.5, + 'weight_sparsity': 0.5, + 'activation_precision': 8, + 'weight_precision': 8, + 'output_precision': 8, # output precision (unit: bit) + 'input_precision': 2, + 'input_channel': 32, # how many input in parallel (per bank) + 'output_channel': 6, # how many output in parallel (per bank) + 'booth_encoding': True, + 'multiplier_precision': 8, + 'adder_input_precision':9, + 'accumulator_input_precision': 14, + 'accumulator_precision':32, + 'reg_accumulator_precision': 32, + 'reg_input_bitwidth': 32*2, + 'pipeline': False, + 'vdd': 0.9, # V + 'rows': 32, # equal to the number of input channels + 'cols': 48, + 'banks': 64, # number of cores + 'area': 0.9408, # mm2 + 'tclk': 1/195*1000, # ns + 'TOP/s': 6144*195*(10**-6), + 'TOP/s/W': 36.63, + 'unit_area': 0, # um2 + 'unit_delay': 0, #ns + 'unit_cap': 0 #fF + } +cacti_ISSCC2022_15_5 = { # 256B, bw: 48 + 'delay': 0.0669052, #ns + 'r_energy': 0.000221196*10**6/64*81, #fJ + 'w_energy': 0.000328423*10**6/64*81, #fJ + 'area': 0.00065545 #mm2 + } + +""" +ISSCC2022, 11.7 (50% input sparsity, unknown weight sparsity, average performance reported) +""" +dimc_ISSCC2022_11_7 = { # https://ieeexplore.ieee.org/document/9731545 (28nm) + 'paper_idx': 
'ISSCC2022, 11.7', + 'input_toggle_rate': 0.5, # assumption (this paper will not be used for energy validation) + 'weight_sparsity': 0.9, # assumption (this paper will not be used for energy validation) + 'activation_precision': 8, + 'weight_precision': 8, + 'output_precision': 21, + 'input_precision': 1, + 'input_channel': 32, # how many input in parallel (per bank) + 'output_channel': 1, # how many output in parallel (per bank) + 'booth_encoding': False, + 'multiplier_precision': 8, + 'adder_input_precision':16, + 'accumulator_input_precision': 8, + 'accumulator_precision':16, + 'reg_accumulator_precision': 16, + 'reg_input_bitwidth': 32, + 'pipeline': True, + 'reg_pipeline_precision':8, + 'vdd': 0.9, # V + 'rows': 32*16, # equal to the number of input channels + 'cols': 8*4, + 'banks': 2, # number of cores + 'area': 0.03, # mm2 + 'tclk': 3, # ns + 'TOP/s': 0.0054, + 'TOP/s/W': 22, + 'unit_area': 0, # um2 + 'unit_delay': 0, #ns + 'unit_cap': 0 #fF + } + +cacti_ISSCC2022_11_7 = { # 2048B, bw: 64 + 'delay': 0.0944664, #ns + 'r_energy': 0.5643*1000/64*81, #fJ + 'w_energy': 0.607*1000/64*81, #fJ + 'area': 0.00396 #mm2 + } + +""" +ISSCC2023, 7.2 (50% input sparsity, 50% weight sparsity) +""" +dimc_ISSCC2023_7_2 = { # https://ieeexplore.ieee.org/document/10067260/ + 'paper_idx': 'ISSCC2023, 7.2', + 'input_toggle_rate': 0.5, + 'weight_sparsity': 0.5, + 'activation_precision': 8, + 'weight_precision': 8, + 'output_precision': 23, + 'input_precision': 2, + 'input_channel': 128, # how many input in parallel (per bank) + 'output_channel': 8, # how many output in parallel (per bank) + 'booth_encoding': False, + 'multiplier_precision': 1, + 'adder_input_precision':2, + 'accumulator_input_precision': 17, + 'accumulator_precision':23, + 'reg_accumulator_precision': 23, + 'reg_input_bitwidth': 2, + 'pipeline': False, + 'reg_pipeline_precision':None, + 'vdd': 0.9, # V + 'rows': 64, + 'cols': 128, # equal to the number of input channels + 'banks': 8, # number of cores + 'area': 
0.1462, # mm2 + 'tclk': 1000/182, # ns + 'TOP/s': None, + 'TOP/s/W': 19.5, + 'unit_area': 0, # um2 + 'unit_delay': 0, #ns + 'unit_cap': 0 #fF + } +cacti_value_ISSCC2023_7_2 = { # here I temporarily use: 1024 B, bw: 64 (no 128 in raw data) + 'delay': 0.0914947, #ns + 'r_energy': 0.401656*1000/64*81, #fJ + 'w_energy': 0.855128*1000/64*81, #fJ + 'area': 0.00193147 #mm2 + } + + +if __name__ == '__main__': + unit_area = 0.614 #um2 + unit_delay = 0.0478 #ns + unit_cap = 0.7 #fF + dimc_ISSCC2022_15_5['unit_area'] = unit_area #um2 + dimc_ISSCC2022_11_7['unit_area'] = unit_area #um2 + dimc_ISSCC2023_7_2['unit_area'] = unit_area #um2 + dimc_ISSCC2022_15_5['unit_delay'] = unit_delay #ns + dimc_ISSCC2022_11_7['unit_delay'] = unit_delay #ns + dimc_ISSCC2023_7_2['unit_delay'] = unit_delay #ns + dimc_ISSCC2022_15_5['unit_cap'] = unit_cap #fF + dimc_ISSCC2022_11_7['unit_cap'] = unit_cap #fF + dimc_ISSCC2023_7_2['unit_cap'] = unit_cap #fF + print(dimc_cost_estimation(dimc_ISSCC2022_15_5, cacti_ISSCC2022_15_5) ) + print(dimc_cost_estimation(dimc_ISSCC2022_11_7, cacti_ISSCC2022_11_7), 'Energy value does not make sense for this work (3rd value)') # no energy validation for this + print(dimc_cost_estimation(dimc_ISSCC2023_7_2, cacti_value_ISSCC2023_7_2)) diff --git a/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation4.py b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation4.py new file mode 100755 index 00000000..fcfc77cf --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation4.py @@ -0,0 +1,46 @@ +from dimc_validation_subfunc4 import dimc_cost_estimation4 + +""" +ISSCC2023, 16.3 (50% input sparsity, 50% weight sparsity) +""" +dimc_ISSCC2023_16_3 = { + 'paper_idx': 'ISSCC2023, 16.3', + 'input_toggle_rate': 0.5, + 'weight_sparsity': 0.5, + 'activation_precision': 8, + 'weight_precision': 8, + 'output_precision': 8, #not used + 'input_precision': 1, + 'input_channel': 128, # how many 
input in parallel (per bank) + 'output_channel': 8, # how many output in parallel (per bank) + 'booth_encoding': False, + 'multiplier_precision': 1, + 'adder_input_precision':4, + 'accumulator_input_precision': 9, + 'accumulator_precision':17, + 'reg_accumulator_precision': 17, + 'reg_input_bitwidth': 1, + 'pipeline': False, + 'reg_pipeline_precision':6, + 'vdd': 0.9, # V + 'rows': 128, + 'cols': 128, # equal to the number of input channels + 'banks': 4, # number of cores + 'area': 0.269, # mm2 + 'tclk': 1000/400, # ns + 'TOP/s': None, + 'TOP/s/W': 275, + 'unit_area': 0.614, # um2 + 'unit_delay': 0.0478, #ns + 'unit_cap': 0.7 #fF + } +cacti_value_ISSCC2023_16_3 = { # rows: 256, bw: 64 + 'delay': 0.0944664, #ns + 'r_energy': 0.000691128*1000/64*81, #fJ + 'w_energy': 0.00102207*1000/64*81, #fJ + 'area': 0.00416728 #mm2 + } + + +if __name__ == '__main__': + print(dimc_cost_estimation4(dimc_ISSCC2023_16_3, cacti_value_ISSCC2023_16_3)) diff --git a/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation_subfunc.py b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation_subfunc.py new file mode 100755 index 00000000..5ce77eab --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation_subfunc.py @@ -0,0 +1,176 @@ +from dimc_cost_model import UnitNand2, UnitDff, MultiplierArray, Adder, AdderTree, MemoryInstance + +def dimc_cost_estimation(dimc, cacti_value): + unit_reg = UnitDff(dimc['unit_area'], dimc['unit_delay'], dimc['unit_cap']) + unit_area = dimc['unit_area'] + unit_delay = dimc['unit_delay'] + unit_cap = dimc['unit_cap'] + input_channel = dimc['input_channel'] + reg_input_bitwidth = dimc['reg_input_bitwidth'] + input_bandwidth = input_channel * dimc['input_precision'] + output_bandwidth_per_channel = dimc['output_precision'] + """ + multiplier array for each output channel + """ + if dimc['booth_encoding'] == True: + mults = 
MultiplierArray(vdd=dimc['vdd'],input_precision=int(dimc['multiplier_precision']),number_of_multiplier=input_channel*dimc['input_precision']/2, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + else: + mults = MultiplierArray(vdd=dimc['vdd'],input_precision=int(dimc['multiplier_precision']),number_of_multiplier=input_channel*dimc['input_precision'], unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + """ + adder_tree for each output channel + """ + adder_tree = AdderTree(vdd=dimc['vdd'], input_precision=int(dimc['adder_input_precision']), number_of_input=input_channel, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + """ + accumulator for each output channel + """ + accumulator = Adder(vdd=dimc['vdd'], input_precision=int(dimc['accumulator_precision']), unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + """ + memory instance (delay unit: ns, energy unit: fJ, area unit: mm2) + unitbank: sram bank, data from CACTI + regs_input: input register files + regs_output: output register files for each output channel + regs_accumulator: register files inside accumulator for each output channel (configuration is the same as regs_output) + """ + # bank delay is neglected for delay validation (due to small contribution; and RBL delay is also included, therefore discrepancy exists). 
+ unitbank = MemoryInstance(name='unitbank', size=dimc['rows']*dimc['cols'], r_bw=dimc['cols'], w_bw=dimc['cols'], delay=0, r_energy=cacti_value['r_energy'], w_energy=cacti_value['w_energy'], area=cacti_value['area'], r_port=1, w_port=1, rw_port=0, latency=0) + regs_input = MemoryInstance(name='regs_input', size=reg_input_bitwidth, r_bw=reg_input_bitwidth, w_bw=reg_input_bitwidth, delay=unit_reg.calculate_delay(), r_energy=0, w_energy=unit_reg.calculate_cap() * dimc['vdd']**2 * reg_input_bitwidth, area=unit_reg.calculate_area()*reg_input_bitwidth, r_port=1, w_port=1, rw_port=0, latency=1) + regs_output = MemoryInstance(name='regs_output',size=output_bandwidth_per_channel, r_bw=output_bandwidth_per_channel, w_bw=output_bandwidth_per_channel, delay=unit_reg.calculate_delay(), r_energy=0, w_energy=unit_reg.calculate_cap() * dimc['vdd']**2 * output_bandwidth_per_channel, area=unit_reg.calculate_area()*output_bandwidth_per_channel, r_port=1, w_port=1, rw_port=0, latency=1) + regs_accumulator = MemoryInstance(name='regs_accumulator', size=dimc['reg_accumulator_precision'], r_bw=dimc['reg_accumulator_precision'], w_bw=dimc['reg_accumulator_precision'], delay=unit_reg.calculate_delay(), r_energy=0, w_energy=unit_reg.calculate_cap() * dimc['vdd']**2 * dimc['reg_accumulator_precision'], area=unit_reg.calculate_area()*dimc['reg_accumulator_precision'], r_port=1, w_port=1, rw_port=0, latency=0) + # pipeline after adder tree and before accumulator + if dimc['pipeline'] == True: + pipeline_bw_per_channel = dimc['reg_pipeline_precision'] + regs_pipeline = MemoryInstance(name='regs_pipeline', size=pipeline_bw_per_channel, r_bw=pipeline_bw_per_channel, w_bw=pipeline_bw_per_channel, delay=unit_reg.calculate_delay(), r_energy=0, w_energy=unit_reg.calculate_cap() * dimc['vdd']**2 * pipeline_bw_per_channel, area=unit_reg.calculate_area()*pipeline_bw_per_channel, r_port=1, w_port=1, rw_port=0, latency=1) + else: + regs_pipeline = MemoryInstance(name='regs_pipeline', size=0, r_bw=0, 
w_bw=0, delay=0, r_energy=0, w_energy=0, area=0, r_port=1, w_port=1, rw_port=0, latency=0) + + ################### special cost for each paper ################################## + """ + special cost for ISSCC2023, 7.2: adder tree across output channels + """ + if dimc['paper_idx'] == 'ISSCC2023, 7.2': + adder_tree_channel = AdderTree(vdd=dimc['vdd'], input_precision=16, number_of_input=8, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + ################################################################################## + + """ + calculate result + :predicted_area: The area cost for entire IMC core (unit: mm2) + :predicted_delay: The minimum delay of single clock period (unit: ns) + :predicted_energy_per_cycle: The energy cost each time the IMC core is activated (unit: fJ) + :number_of_cycle: The number of cycle for computing entire input + :predicted_energy: The energy cost for computing entire input (unit: fJ) + :number_of_operations: The number of operations executed when computing entire input + :predicted_tops: Peak TOP/s + :predicted_topsw: Peak TOP/s/W + """ + + ## Area cost breakdown + area_mults = dimc['banks'] * dimc['output_channel'] * mults.calculate_area() + area_adder_tree = dimc['banks'] * dimc['output_channel'] * adder_tree.calculate_area() + area_accumulator = dimc['banks'] * dimc['output_channel'] * accumulator.calculate_area() + area_banks = dimc['banks'] * unitbank.area + area_regs_input = dimc['banks'] * regs_input.area + area_regs_output = dimc['banks'] * dimc['output_channel'] * regs_output.area + area_regs_accumulator = dimc['banks'] * dimc['output_channel'] * regs_accumulator.area + area_regs_pipeline = dimc['banks'] * dimc['output_channel'] * regs_pipeline.area + + if dimc['paper_idx'] == 'ISSCC2022, 15.5': # extra area cost for supporting FP operation + extra_adder_tree = AdderTree(vdd=dimc['vdd'], input_precision=32, number_of_input=2, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + extra_accumulator = 
Adder(vdd=dimc['vdd'], input_precision=32, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + extra_regs_accumulator = MemoryInstance(name='extra_regs_accumulator', size=32, r_bw=32, w_bw=32, delay=0, r_energy=0, w_energy=0, area=1.764/(10**6)*32, r_port=1, w_port=1, rw_port=0, latency=0) + area_extra_adder_tree = dimc['banks'] * 5 * extra_adder_tree.calculate_area() + area_extra_accumulator = dimc['banks'] * 5 * extra_accumulator.calculate_area() + area_extra_regs_accumulator = dimc['banks'] * 5 * extra_regs_accumulator.area + + area_adder_tree += area_extra_adder_tree + area_accumulator += area_extra_accumulator + area_regs_accumulator += area_extra_regs_accumulator + + if dimc['paper_idx'] == 'ISSCC2022, 11.7': + area_accumulator = dimc['banks'] * dimc['output_channel'] * dimc['input_channel'] * accumulator.calculate_area() + area_regs_accumulator = dimc['banks'] * dimc['output_channel'] * dimc['input_channel'] * regs_accumulator.area + area_regs_input = regs_input.area # input regs are shared across banks + area_regs_pipeline = dimc['banks'] * dimc['output_channel'] * dimc['input_channel'] * regs_pipeline.area + if dimc['paper_idx'] == 'ISSCC2023, 7.2': + area_adder_tree_channel = dimc['banks'] * adder_tree_channel.calculate_area() + area_adder_tree += area_adder_tree_channel + area_accumulator = dimc['banks'] * accumulator.calculate_area() + area_regs_output = dimc['banks'] * regs_output.area + area_regs_accumulator = dimc['banks'] * regs_accumulator.area + + predicted_area = area_mults + area_adder_tree + area_accumulator + area_banks + area_regs_input * 0 + area_regs_output * 0 + area_regs_accumulator + area_regs_pipeline # cost of input/output regs has been taken out + + ## Minimum clock time + adder_1b_carry_delay = 2*UnitNand2(unit_area, unit_delay, unit_cap).calculate_delay() + accumulator_delay = accumulator.calculate_delay_lsb()+adder_1b_carry_delay * (dimc['reg_accumulator_precision']-dimc['accumulator_input_precision']) + if 
dimc['pipeline'] == True: + if dimc['paper_idx'] == 'ISSCC2022, 11.7': # for dimc2 + #predicted_delay = max(unitbank.delay + mults.calculate_delay(), adder_tree.calculate_delay() + accumulator.calculate_delay_msb()) + accumulator_delay = accumulator.calculate_delay_lsb() + predicted_delay = max(unitbank.delay + mults.calculate_delay(), adder_tree.calculate_delay() + accumulator_delay) + else: + #predicted_delay = max(unitbank.delay + mults.calculate_delay() + adder_tree.calculate_delay(), accumulator.calculate_delay_msb()) + predicted_delay = max(unitbank.delay + mults.calculate_delay() + adder_tree.calculate_delay(), accumulator_delay) + else: + if dimc['paper_idx'] == 'ISSCC2023, 7.2': # for dimc3 + #predicted_delay = unitbank.delay + mults.calculate_delay() + adder_tree.calculate_delay() + accumulator.calculate_delay_msb() + adder_tree_channel.calculate_delay() + predicted_delay = unitbank.delay + mults.calculate_delay() + adder_tree.calculate_delay() + accumulator_delay + adder_tree_channel.calculate_delay() + else: # for dimc1 + #predicted_delay = unitbank.delay + mults.calculate_delay() + adder_tree.calculate_delay() + accumulator.calculate_delay_msb() + predicted_delay = unitbank.delay + mults.calculate_delay() + adder_tree.calculate_delay() + accumulator_delay + + ## Energy cost breakdown per cycle + energy_mults = dimc['input_toggle_rate'] * dimc['weight_sparsity'] * dimc['banks'] * dimc['output_channel'] * mults.calculate_energy() + energy_adder_tree = dimc['input_toggle_rate'] * dimc['weight_sparsity'] * dimc['banks'] * dimc['output_channel'] * adder_tree.calculate_energy() + energy_accumulator = dimc['banks'] * dimc['output_channel'] * accumulator.calculate_energy() + energy_banks = dimc['banks'] * unitbank.r_energy * 0 # set to zero because: (1) from validation, this cost is a very small percentage of the entire macro energy; (2) papers don't report how often (in cycles) the data is read out. 
+ energy_regs_input = dimc['banks'] * regs_input.w_energy + energy_regs_output = dimc['banks'] * dimc['output_channel'] * regs_output.w_energy + energy_regs_accumulator = dimc['banks'] * dimc['output_channel'] * regs_accumulator.w_energy + energy_regs_pipeline = dimc['banks'] * dimc['output_channel'] * regs_pipeline.w_energy + + if dimc['paper_idx'] == 'ISSCC2022, 11.7': + pass + if dimc['paper_idx'] == 'ISSCC2023, 7.2': + energy_adder_tree_channel = dimc['banks'] * adder_tree_channel.calculate_energy() + energy_adder_tree += energy_adder_tree_channel + energy_accumulator = dimc['banks'] * accumulator.calculate_energy() + energy_regs_output = dimc['banks'] * regs_output.w_energy + energy_regs_accumulator = dimc['banks'] * regs_accumulator.w_energy + + predicted_energy_per_cycle = energy_mults + energy_adder_tree + energy_accumulator + energy_banks + energy_regs_accumulator + energy_regs_pipeline # + energy_regs_input + energy_regs_output + + number_of_cycle = dimc['activation_precision']/dimc['input_precision'] + + predicted_energy = predicted_energy_per_cycle * number_of_cycle + + number_of_operations = 2*dimc['banks']*dimc['output_channel']*dimc['input_channel'] # 1MAC = 2 Operations + if dimc['paper_idx'] == 'ISSCC2023, 7.2': + number_of_operations = 2*dimc['banks']*dimc['output_channel']*dimc['input_channel']/dimc['weight_precision'] # 1MAC = 2 Operations + + predicted_tops = number_of_operations/(predicted_delay*number_of_cycle) / (10**3) + predicted_topsw = number_of_operations/predicted_energy * 10**3 + + ## Energy breakdown per MAC + number_of_mac = number_of_operations/2 + energy_mults_mac = energy_mults * number_of_cycle/number_of_mac + energy_adder_tree_mac = energy_adder_tree * number_of_cycle/number_of_mac + energy_accumulator_mac = energy_accumulator * number_of_cycle/number_of_mac + energy_banks_mac = energy_banks * number_of_cycle/number_of_mac + # energy_regs_input_mac = energy_regs_input * number_of_cycle/number_of_mac + # energy_regs_output_mac = 
energy_regs_output * number_of_cycle/number_of_mac + energy_regs_accumulator_mac = energy_regs_accumulator * number_of_cycle/number_of_mac + energy_regs_pipeline_mac = energy_regs_pipeline * number_of_cycle/number_of_mac + energy_estimation_per_mac = predicted_energy/number_of_mac + energy_reported_per_mac = 2000/dimc['TOP/s/W'] + + area_mismatch = abs(predicted_area/dimc['area']-1) + delay_mismatch = abs(predicted_delay/dimc['tclk']-1) + energy_mismatch = abs(energy_estimation_per_mac/energy_reported_per_mac-1) + return area_mismatch, delay_mismatch, energy_mismatch + #print(area_mults, area_adder_tree, area_accumulator+area_regs_accumulator, area_banks, area_regs_pipeline) + #print(energy_mults_mac, energy_adder_tree_mac, energy_accumulator_mac+energy_regs_accumulator_mac, energy_banks_mac, energy_regs_pipeline_mac) + # return predicted_area, predicted_delay, predicted_energy/number_of_operations \ No newline at end of file diff --git a/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation_subfunc4.py b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation_subfunc4.py new file mode 100755 index 00000000..7053ffa5 --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/dimc_validation_subfunc4.py @@ -0,0 +1,98 @@ +from dimc_cost_model import UnitNand2, UnitDff, MultiplierArray, Adder, AdderTree, MemoryInstance + +def dimc_cost_estimation4(dimc, cacti_value): + unit_reg = UnitDff(dimc['unit_area'], dimc['unit_delay'], dimc['unit_cap']) + unit_area = dimc['unit_area'] + unit_delay = dimc['unit_delay'] + unit_cap = dimc['unit_cap'] + input_channel = dimc['input_channel'] + """ + multiplier array for each output channel + """ + mults = MultiplierArray(vdd=dimc['vdd'],input_precision=int(dimc['multiplier_precision']),number_of_multiplier=input_channel*dimc['input_precision'], unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + """ + adder_tree (1/3) for each output channel + 
""" + + adder_tree1 = AdderTree(vdd=dimc['vdd'], input_precision=1, number_of_input=16, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + adder_tree2 = AdderTree(vdd=dimc['vdd'], input_precision=4, number_of_input=8, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + adder_tree3 = AdderTree(vdd=dimc['vdd'], input_precision=6, number_of_input=8, unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + """ + accumulator for each output channel + """ + accumulator = Adder(vdd=dimc['vdd'], input_precision=dimc['accumulator_precision'], unit_area=unit_area, unit_delay=unit_delay, unit_cap=unit_cap) + + """ + memory instance (delay unit: ns, energy unit: fJ, area unit: mm2) + unitbank: sram bank, data from CACTI + """ + unitbank = MemoryInstance(name='unitbank', size=dimc['rows']*dimc['cols'], r_bw=dimc['cols'], w_bw=dimc['cols'], delay=cacti_value['delay']*0, r_energy=cacti_value['r_energy'], w_energy=cacti_value['w_energy'], area=cacti_value['area'], r_port=1, w_port=1, rw_port=0, latency=0) + regs_accumulator = MemoryInstance(name='regs_accumulator', size=dimc['reg_accumulator_precision'], r_bw=dimc['reg_accumulator_precision'], w_bw=dimc['reg_accumulator_precision'], delay=unit_reg.calculate_delay(), r_energy=0, w_energy=unit_reg.calculate_cap() * dimc['vdd']**2 * dimc['reg_accumulator_precision'], area=unit_reg.calculate_area()*dimc['reg_accumulator_precision'], r_port=1, w_port=1, rw_port=0, latency=0) + regs_pipeline = MemoryInstance(name='regs_accumulator', size=dimc['reg_pipeline_precision'], r_bw=dimc['reg_pipeline_precision'], w_bw=dimc['reg_pipeline_precision'], delay=unit_reg.calculate_delay(), r_energy=0, w_energy=unit_reg.calculate_cap() * dimc['vdd']**2 * dimc['reg_pipeline_precision'], area=unit_reg.calculate_area()*dimc['reg_pipeline_precision'], r_port=1, w_port=1, rw_port=0, latency=0) + + """ + calculate result + :predicted_area: The area cost for entire IMC core (unit: mm2) + :predicted_delay: The minimum delay 
of single clock period (unit: ns) + :predicted_energy_per_cycle: The energy cost each time the IMC core is activated (unit: fJ) + :number_of_cycle: The number of cycle for computing entire input + :predicted_energy: The energy cost for computing entire input (unit: fJ) + :number_of_operations: The number of operations executed when computing entire input + :predicted_tops: Peak TOP/s + :predicted_topsw: Peak TOP/s/W + """ + + ## Area cost breakdown + area_mults = dimc['banks'] * dimc['output_channel'] * mults.calculate_area() + area_adder_tree = dimc['banks'] * dimc['output_channel'] * ( 8*8*adder_tree1.calculate_area() + 8*adder_tree2.calculate_area() + adder_tree3.calculate_area() ) + area_regs_pipeline = dimc['banks'] * dimc['output_channel'] * 8*regs_pipeline.area + area_accumulator = dimc['banks'] * dimc['output_channel'] * accumulator.calculate_area() + area_banks = dimc['banks'] * unitbank.area + area_regs_accumulator = dimc['banks'] * dimc['output_channel'] * regs_accumulator.area + + predicted_area = area_mults + area_adder_tree + area_regs_pipeline + area_accumulator + area_banks + area_regs_accumulator # cost of input/output regs is not taken out + + ## Minimum clock time + adder_1b_carry_delay = 2*UnitNand2(unit_area, unit_delay, unit_cap).calculate_delay() + accumulator_delay = accumulator.calculate_delay_lsb()+adder_1b_carry_delay * (dimc['reg_accumulator_precision']-dimc['accumulator_input_precision']) + #predicted_delay = max(unitbank.delay + mults.calculate_delay() + adder_tree1.calculate_delay() + adder_tree2.calculate_delay(), adder_tree3.calculate_delay() + accumulator.calculate_delay_msb()) + predicted_delay = max(unitbank.delay + mults.calculate_delay() + adder_tree1.calculate_delay() + adder_tree2.calculate_delay(), adder_tree3.calculate_delay() + accumulator_delay) + + ## Energy cost breakdown per cycle + energy_mults = dimc['input_toggle_rate'] * dimc['banks'] * dimc['output_channel'] * mults.calculate_energy() # fJ + energy_adder_tree = 
dimc['input_toggle_rate'] * dimc['weight_sparsity'] * dimc['banks'] * dimc['output_channel'] * ( 8*8*adder_tree1.calculate_energy() + 8*adder_tree2.calculate_energy() + adder_tree3.calculate_energy() ) # fJ + energy_accumulator = dimc['banks'] * dimc['output_channel'] * accumulator.calculate_energy() + energy_banks = 0 # make it to zero because: (1) from validation, this cost is very small in percentage to entire macro energy; (2) papaers don't report how many cycles they will read out the data once. + energy_regs_accumulator = dimc['banks'] * dimc['output_channel'] * regs_accumulator.w_energy + energy_regs_pipeline = dimc['banks'] * dimc['output_channel'] * 8*regs_pipeline.w_energy + + predicted_energy_per_cycle = energy_mults + energy_adder_tree + energy_accumulator + energy_banks + energy_regs_accumulator + energy_regs_pipeline + + number_of_cycle = dimc['activation_precision']/dimc['input_precision'] + + predicted_energy = predicted_energy_per_cycle * number_of_cycle + + number_of_operations = 2*dimc['banks']*dimc['output_channel']*dimc['input_channel'] # 1MAC = 2 Operations + predicted_tops = number_of_operations/(predicted_delay*number_of_cycle) / (10**3) + predicted_topsw = number_of_operations/predicted_energy * 10**3 + + ## Energy breakdown per MAC + number_of_mac = number_of_operations/2 + energy_mults_mac = energy_mults * number_of_cycle/number_of_mac + energy_adder_tree_mac = energy_adder_tree * number_of_cycle/number_of_mac + energy_accumulator_mac = energy_accumulator * number_of_cycle/number_of_mac + energy_banks_mac = energy_banks * number_of_cycle/number_of_mac + energy_regs_accumulator_mac = energy_regs_accumulator * number_of_cycle/number_of_mac + energy_regs_pipeline_mac = energy_regs_pipeline * number_of_cycle/number_of_mac + energy_estimation_per_mac = predicted_energy/number_of_mac + energy_reported_per_mac = 2000/dimc['TOP/s/W'] + + area_mismatch = abs(predicted_area/dimc['area']-1) + delay_mismatch = abs(predicted_delay/dimc['tclk']-1) + 
energy_mismatch = abs(energy_estimation_per_mac/energy_reported_per_mac-1) + return area_mismatch, delay_mismatch, energy_mismatch + print(area_mults, area_adder_tree, area_accumulator+area_regs_accumulator, area_banks, area_regs_pipeline) + print(energy_mults_mac, energy_adder_tree_mac, energy_accumulator_mac+energy_regs_accumulator_mac, energy_banks_mac, energy_regs_pipeline_mac) + # return predicted_area, predicted_delay, predicted_energy/number_of_operations \ No newline at end of file diff --git a/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/model_extration_28nm.py b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/model_extration_28nm.py new file mode 100755 index 00000000..bfea8c43 --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/dimc_validation/28nm/model_extration_28nm.py @@ -0,0 +1,68 @@ +from dimc_validation import dimc_ISSCC2022_15_5, cacti_ISSCC2022_15_5, dimc_ISSCC2022_11_7, cacti_ISSCC2022_11_7, dimc_ISSCC2023_7_2, cacti_value_ISSCC2023_7_2 +from dimc_validation4 import dimc_ISSCC2023_16_3, cacti_value_ISSCC2023_16_3 +from dimc_validation_subfunc4 import dimc_cost_estimation4 +from dimc_validation_subfunc import dimc_cost_estimation + +def area_fitting(): + mismatch = 1 + for area in range(294, 1000, 1): + dimc_ISSCC2022_15_5['unit_area'] = area/1000 #um2 + dimc_ISSCC2022_11_7['unit_area'] = area/1000 #um2 + dimc_ISSCC2023_7_2['unit_area'] = area/1000 #um2 + dimc_ISSCC2023_16_3['unit_area'] = area/1000 #um2 + a1, d1, e1 = dimc_cost_estimation(dimc_ISSCC2022_15_5, cacti_ISSCC2022_15_5) + a2, d2, e2 = dimc_cost_estimation(dimc_ISSCC2022_11_7, cacti_ISSCC2022_11_7) + a3, d3, e3 = dimc_cost_estimation(dimc_ISSCC2023_7_2, cacti_value_ISSCC2023_7_2) + a4, d4, e4 = dimc_cost_estimation4(dimc_ISSCC2023_16_3, cacti_value_ISSCC2023_16_3) + at = (a1+a2+a3+a4)/4 # average area mismatch + dt = (d1+d2+d3+d4)/4 # average delay mismatch + et = (e1+e3)/2 # average energy mismatch (peak energy is not reported in paper2) 
+ if at < mismatch: + mismatch = at + fitted_unit_area = area/1000 + print(f"fitted_unit_area: {fitted_unit_area}, average_mismatch: {mismatch}") + return mismatch, fitted_unit_area + +def delay_fitting(): + mismatch = 1 + for delay in range(150, 500, 1): + dimc_ISSCC2022_15_5['unit_delay'] = delay/10000 #ns + dimc_ISSCC2022_11_7['unit_delay'] = delay/10000 #ns + dimc_ISSCC2023_7_2['unit_delay'] = delay/10000 #ns + dimc_ISSCC2023_16_3['unit_delay'] = delay/10000 #ns + a1, d1, e1 = dimc_cost_estimation(dimc_ISSCC2022_15_5, cacti_ISSCC2022_15_5) + a2, d2, e2 = dimc_cost_estimation(dimc_ISSCC2022_11_7, cacti_ISSCC2022_11_7) + a3, d3, e3 = dimc_cost_estimation(dimc_ISSCC2023_7_2, cacti_value_ISSCC2023_7_2) + a4, d4, e4 = dimc_cost_estimation4(dimc_ISSCC2023_16_3, cacti_value_ISSCC2023_16_3) + at = (a1+a2+a3+a4)/4 # average area mismatch + dt = (d1+d2+d3+d4)/4 # average delay mismatch + et = (e1+e3)/2 # average energy mismatch (peak energy is not reported in paper2) + if dt < mismatch: + mismatch = dt + dlist = [d1,d2,d3,d4] + fitted_unit_delay = delay/10000 + print(f"fitted_unit_delay: {fitted_unit_delay}, average_mismatch: {mismatch}") + return mismatch, fitted_unit_delay + +def cap_fitting(): + mismatch = 1 + for cap in range(1, 50, 1): + dimc_ISSCC2022_15_5['unit_cap'] = cap/10 #fF + dimc_ISSCC2022_11_7['unit_cap'] = cap/10 #fF + dimc_ISSCC2023_7_2['unit_cap'] = cap/10 #fF + a1, d1, e1 = dimc_cost_estimation(dimc_ISSCC2022_15_5, cacti_ISSCC2022_15_5) + a2, d2, e2 = dimc_cost_estimation(dimc_ISSCC2022_11_7, cacti_ISSCC2022_11_7) + a3, d3, e3 = dimc_cost_estimation(dimc_ISSCC2023_7_2, cacti_value_ISSCC2023_7_2) + at = (a1+a2+a3)/3 # average area mismatch + dt = (d1+d2+d3)/3 # average delay mismatch + et = (e1+e3)/2 # average energy mismatch (peak energy is not reported in paper2) + if et < mismatch: + mismatch = et + fitted_unit_cap = cap/10 + print(f"fitted_unit_cap: {fitted_unit_cap}, average_mismatch: {mismatch}") + return mismatch, fitted_unit_cap + +if __name__ 
== '__main__': + area_fitting() + delay_fitting() + cap_fitting() diff --git a/zigzag/inputs/validation/hardware/sram_imc/imc_validation_hw_architectures.svg b/zigzag/inputs/validation/hardware/sram_imc/imc_validation_hw_architectures.svg new file mode 100755 index 00000000..fb6f309d --- /dev/null +++ b/zigzag/inputs/validation/hardware/sram_imc/imc_validation_hw_architectures.svg @@ -0,0 +1,13328 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Paper summary + + aimc1 + + paper idx + publisher + link + CICC2021 + https://ieeexplore.ieee.org/document/9431575 + aimc2 + JSSC2023(ISSCC2021, 16.3) + https://ieeexplore.ieee.org/document/9896828/(https://ieeexplore.ieee.org/document/9365984) + aimc3 + ISSCC20237.8 + https://ieeexplore.ieee.org/document/10067289 + + + + + + + Tech (nm) + 22 + 28 + 22 + Voltage (V) + 0.8 + 0.9 + 0.8 + Input feature + time pulse + voltage level + voltage level + Accumulation feature + charge sharing + charge sharing + cap coupling + DAC precision + 7 + 2 + None + ADC precision + 6 + 5 + 3*1 + Di + 1024 + 16 + 64*7 + Do + 512 + 12 + 256 + #macro + 1 + 4 + 8 + M + 1 + 32 + 1 + *1: effective Di and ADC precision in [4], because it uses threshold detecting scheme for ADC. Even though its ADC resolution is 7b, the value it reports is 3b (this is a guess. 7b ADC is not possible in this short Tclk Compared to [2] and [3].) + Bi/Bw/Bo + 8/8/20 + 8/8/24 + Tclk (ns) + 7.2 + Area (mm2) + 0.468 + fJ/MAC + #MAC/cycle + ICHxOCHx#macro*3 + AIMC papers + *2: sparsity unknown + 7/2/6 + 1000/22.5 + 1.9425 + 2000/1050 + ICHxOCHx#macro + 2000/15.02 + *3: seems in [2] and [3], 8b input will transfer 2 times, but happens in 1 cycle (4b input takes 4.2 ns, while 8b input takes 8.4 ns in [3]) + 2000/18.7*4 + *4: [4] uses delta input, so energy probably is much lower than normal. (seems not affect model fitting.) 
+ 1000/364 + 1.88 + sparsity + Sparw: 50%Spari: 37.5% + ICHxOCHx#macro + Sparw: 0%*5Spari: 0% + *5: sparsity is not reported in paper. This is an assumption. + + *6: there are 64 Di, but only 16 are activated in paper. + Sparw: NASpari: NA + *7: for aimc3, due to it uses cap-coupling summation method, it's effective Di is 1 when calculating ADC's delay. + + + dimc1 + + paper idx + publisher + link + ISSCC202215.5 + https://ieeexplore.ieee.org/document/9731762 + dimc2 + ISSCC202211.7 + https://ieeexplore.ieee.org/document/9731545 + dimc3 + ISSCC20237.2 + + + + + + + Tech (nm) + 28 + 28 + 28 + Voltage (V) + 0.9 + Di + 32 + 32 + 128 + Do + 6 + 1 + 8 + #macro + 64 + 2 + 8 + M + 1 + 16 + 8 + Bi/Bw/Bo + 8/8/21 + 8/8/23 + Tclk (ns) + 3 + Area (mm2) + 0.03 + fJ/MAC + #MAC/cycle + ICHxOCHx#macro + DIMC papers + 8/8/8 + 1000/195 + 0.9408 + 2000/36.63 + ICHxOCHx#macro + 2000/22 + 2000/19.5 + 1000/182 + 0.1462 + sparsity + ICHxOCHx#macro + Sparw: 50%Spari: 50% + + 0.9 + 0.9 + Sparw: NASpari: NA + https://ieeexplore.ieee.org/document/10067260/ + Sparw: 50%Spari: 50% + + dimc4 + ISSCC202316.3 + https://ieeexplore.ieee.org/document/10067779 + 28 + 0.9 + 128 + 8 + 4 + 2 + 8/8/8 + 1000/400 + 0.269 + 2000/275 + ICHxOCHx#macro + Sparw: NASpari: NA + + + + aimc1 (22nm)CICC(IMEC) + + 1 + + + 6T + act_precision=7(1 cycles) + + W[1] + + 6T + W[0] + Ternary + 7b + + DAC + + + + + + + + + + + + + + + + + + + ... + 1024 + + + + ADC + + 6b + ... + 512 + + + 6b + 7b + + DAC + + 7b + + DAC + + + 7b + + DAC + + 7b + + DAC + + + 7b + + DAC + + ... + 1024 + + + + + aimc2 (28nm)JSSC2023, MarchMeng-fan Chang + + 4 + + + + ... + 32 + + + + + + + + + + + + + + + 2b + + 2b + 6T + act_precision=8(transfer 2 times/cycle) + + + ... + 32 + + + + + + + + + + + + + + 6T + + + ... + 32 + + + + + + + + + + + + + + 6T + ... + 8 + W[0] + W[1] + W[7] + + 2-to-1 MUX + + + ... + 16 + + + + + ... + 16 + + + + + + ... + 16 + + + + + + ADC + + ADC + + ADC + + ADC + ... 
+ 16 + + 5b + + + + + + + + + + + + + + + 20b + + + 20b + + ... + 12 + ... + 12 + + 20b + + + + 4b + ... + ICH:16 + + 4b + + + + 2b + 2b + + DAC + + + DAC + + + DAC + + DAC + chargesharing + + + + + ... + 8 + + + + + + + + + + + + + + + + + pipeline + + + + + + + + + + 7b + + + + + + + + 7b + + + + + + + + 7b + ... + 8 + + + + + + + + 9b + + + + + + + + 9b + ... + 4 + + + + + + + + 12b + + + + + + + + 12b + + + + + + + + 15b + 16b + equivelent adder tree + adding together according to place value + this adder treeis not shown inpaper. It's my guess. + + + 20b + + + + + aimc3 (22nm)ISSCC2023 + + 8 + + act_precision=8(bit-series) + + + 6T + W[0] + 1b + + + + + + + + + + ... + 64 + + + ADC + + 7b + ... + 256 + + + 6b + 1b + + 1b + + 64 + + + + + + ... + + + + + + 6b + + 6b + trancated + result is not accumulated together in IMC macros. + + add a constant value + + + + + 1 + dimc4 (28nm)ISSCC2023, 16.3 + + w0 + + ... + + + + + + + + + 8b input + 1b + + ... + + 128 + + 1b + + 1b + ... + 6T + + + + + + 1b + + + + + + + + + + + + + + ... + + + + + + + + ... + + + + + + + + + + + + + + 6b + + + + + + + + + + + + + + + + 17b + 17b + + 17b + + + ... + 8 + + + + + 6bx8 + + + + + + + + + + + + + + + + + + + + + + + 6b + 9b + + 9b + ... + 8 + 2 + + + + + dimc3 (28nm)ISSCC2023, 7.2 + weight_precision: 8 + Note: this paper supports FP. + + + + ... + + 8 + + 1b + + + + + + + + + + + + 2b + + + ... + + + + + + + 2b + + + + + + ... + + + + + + + + + + + + + + + + ... + + + + + + + + + + + + + + + + + + + + + + + + + + w0[7] + 128 + ... + 128 + + + ... + 8 + + w0[0] + + 9b + + 9b + + 9b + ... + + + << + + + 16b + 17b for signed INT + + + << + + + + + << + + + 16b + 16b + + + + + + + + + + + + + + + + ... + 8 + + + + + + + + + + + + + + + + + + + + + ... + + 17b + + + + + + + + accumulator + + + + + + + + + + + + + + 23b + 23b + + + + + 23b + 23b + + ... + + + 8 + input channel: 1 + + + + + + 23b + + + + + + 23b + + + + ... + + 8 + + 1b + + + + + + + + + + + + + + + + + + + + + + ... 
+ + + + + + + + + + + + + + + + ... + + + + + + + + + + + + + + + + w0[7] + 128 + + 9b + + + 2b + 2b + ... + + + 2b + + 2b + + + + + + + + + + ... + 8 + + + 2b + 2b + + + ... + input channel: 128 + + + + + w0[0] + + + << + + + 16b + + + + << + + + 16b + + + + << + + + 16b + 9b + 9b + 9b + + + + + + + + + + + + + + + + ... + 8 + + + + + + + + + + + + + + + + + + + + + ... + + 17b + + + + + + accumulator + + + + + + + + + 23b + 23b + 23b + + + + + + 23b + 8 + + + + + + 2b + ... + approximate multiplier is used for w0[5-0] + + + + 8 + change drawing, for easy understanding + + + dimc1 (28nm)ISSCC2022, 15.5 + input channel: 32 + + + 64 + 2b + booth encoding + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + weight precision: 8 + + + ... + + + 32 + + + + + + + 9b + + + + + + + 9b + ... + + + + + + + 9b + + + + + + + + + + + ... + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 32b + 32b + 14b + 32b + + + + + 32b + + + + + + 32b + + ... + + + + + + 32b + + 6 + ... + 6 + accumulator + + + dimc2 (28nm)ISSCC2022, 11.7 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + weight precision: 8 + + + ... + + input channel: 32 + 32 + + + + + + 16b + + + + + + ... + + + + + + sum32 + + + + + + + + + + + ... + + + + + + + + + + + + + + + + + + + + + + + + + 16b + 16b + 16b + 8b + accumulator + + ... + 4 + + + + MUX + + + + + + + + + + 8b + + + + + + + 8b + + + + + + 8b + + + + + 16b + 16b + ... + ... + 21b + + + + + 21b + + + 2 + + + input channel: ICH + Bi + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + weight precision: Bw + + + ... + + + ICH + + + + + + + Bw + + + + + + + Bw + ... + + + + + + + + + + + + + + + + + ... + + + + + + + + + + + + + + + + + + + + + + + + + + + Bi+Ba+log2(ICH) + Bi+log2(Bi×ICH) + + + + + + + + + + + ... + + + + + + + OCH + ... 
+ OCH + accumulator + + + + + + + + + + + + + + + + + + + + + Bw + Bi×ICH + Bo (e.g, 8) + + + Architecture summary + Peak performance analysis (single macro) + AIMC + fJ/MAC + multiplier + ADC + adders + weight cell + accumulators + DAC + 0.5CgateVDD2×BwBa/Bi + 0.5CgateVDD2×BwBa/Bi + 6CgateVDD2(Radc(Bw-1)+Bw(log2Bw-0.5))/ICH×Ba/Bi + 9CgateVDD2(Radc+Bw+Ba)/ICH×Ba/Bi + + , if Bi<Ba + 0 + , if Bi==Ba + Note:* Agate/Dgate/Cgate: area/delay/capacitance of a NAND2 gate.* VDD: the supply voltage.* Radc/Badds_out: ADC resolution/output precision of the adder tree* ICH: the number of input channels* OCH: the number of output channels* Ba/Bw/Bo: precision of activation/weight/output* Bi: precision of input of each input channel + Bw(100Radc+0.001×4Radc)VDD2/ICH×Ba/Bi + 50BaVDD2/OCH + Delay (ns) + Dgate + CACTI (access time) + 0 + Radc(0.00653ICH+0.64) + 2Dgate(Radc+Bw+Ba-Badds_out) + + , if Bi<Ba + 0 + , if Bi==Ba + 4.8Dgatelog2Bw+2Dgate(Radc+Bw) + ~5% contribution + Area (mm2) + AgateBw×ICH×OCH + CACTI (area) + 0 + (10**(-0.0369Radc+1.206)×2**Radc×10-6)×Bw×OCH + 13.8Agate(Radc+Bw+Ba)×OCH + + , if Bi<Ba + 0 + , if Bi==Ba + 7.8Agate(Radc(Bw-1)+Bw(log2Bw-0.5))×OCH + #MACs + ICH×OCH×Bi/Ba + + + + + + + + + + + + + Cost breakdown for a single AIMC macro (cap-coupling based AIMC, because it doesn't have "en" signal compared to charge-sharing based AIMC, so lower energy) (for template, refer to the picture at the right) + + + + + + + ... + M + + + + + + + + + Bi + 6T + + + ... + M + + + + + + + + 6T + + + ... + M + + + + + + + + 6T + ... + Bw + W[0] + W[1] + W[7] + + + ... + ICH + + + ... + Bw + + ADC + + ADC + + Radc + + + + + + + + + + + + + + + + ... + OCH + ... + + + + DAC + + cap coupling + + + + + + + + + + + + + + + + ... + + + + + + + + + + + + + + + + + + + + + + Bw+Radc + place valueadder tree + adding together according to place value + + Bw + Bo (e.g, 8) + Bi + + DAC + + + Bi + + DAC + + + ... 
+ ICH + Bw+Bi+Radc + + DIMC + fJ/MAC + multiplier + adders + weight cell + accumulators + 0.5CgateVDD2×BwBa + 0 + 6CgateVDD2[(N(Bw+1)-(Bw+log2N+1)]/ICH×Ba/Bi + 9CgateVDD2(log2ICH+Bw+Ba)/ICH×Ba/Bi + + , if Bi<Ba + 0 + , if Bi==Ba + Delay (ns) + Dgate + CACTI (access time) + 2Dgate(log2N+Bw+Ba-Badds_out) + + , if Bi<Ba + 0 + , if Bi==Ba + 4.8Dgatelog2N+2Dgate(Bw+log2N) + Area (mm2) + AgateBw×ICH×OCH + CACTI (area) + 13.8Agate(log2ICH+Bw+Ba)×OCH + + , if Bi<Ba + 0 + , if Bi==Ba + 7.8Agate[(N(Bw+1)-(Bw+log2N+1)]×OCH + #MACs + ICH×OCH×Bi/Ba + + + + + + + + + + + Note:* Agate/Dgate/Cgate: area/delay/capacitance of a NAND2 gate.* VDD: the supply voltage.* Radc: ADC resolution* ICH: the number of input channels* OCH: the number of output channels* Ba/Bw/Bo: precision of activation/weight/output* Bi: precision of input of each input channel* Sa/Sw: sparsity of activation/weight (word level) (For DIMC, "1-Sa" is treated same as input toggle rate) + (1-Sa + (1-Sa)(1-Sw + N=Bi×ICH (#inputs for a adder tree) + Note: formula in code is in more detail and accuracy.Here will neglect tiny detail (e.g., adders delay) + (1-Sa + (1-Sw + (1-Sa + (1-Sw + (1-Sa + Energy/access + input regs + cache + output regs + DRAM + 3CgateVDD2×Ba×ICH + Delay (ns) + 0 + CACTI (access time) + Area (mm2) + 6AgateBa×ICH + CACTI (area) + + + + + + + + + + Peripheral memory instance + CACTI (w_cost/r_cost) + 3CgateVDD2×Bo×OCH + 0 + 6Agate×Bo×OCH + 0 + 0 (no on-chip cost) + 3.7×bw (pJ) + * 3.7 pJ/bit, refer: from: https://my.eng.utah.edu/~cs7810/pres/14-7810-02.pdf (P8) + + + + + + + + diff --git a/zigzag/inputs/validation/hardware/sram_imc/model_validation.png b/zigzag/inputs/validation/hardware/sram_imc/model_validation.png new file mode 100755 index 00000000..a9d236ba Binary files /dev/null and b/zigzag/inputs/validation/hardware/sram_imc/model_validation.png differ