From 652a15c1623966e3a5b752f6e3525e2b0de2b86f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 1 Mar 2024 10:44:30 +0100 Subject: [PATCH 01/30] Fix collecting duplicate tensors in quantization calibration dataset (#577) * Added deepcopying of inputs collected by InferRequestWrapper. Added a test covering the fixed issue. * Phrasing tweaks * Add soundfile to test requirements * Added librosa to test requirements * Added copying to other data cache appends * Remove the need for real test data * Process __call__ call properly * Addressed suggested changes --- optimum/intel/openvino/quantization.py | 10 +++++-- tests/openvino/test_quantization.py | 40 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 5ec4eac556..331248e023 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import inspect import logging import os @@ -87,11 +88,14 @@ def __init__(self, request, data_cache=None): self.data_cache = data_cache def __call__(self, *args, **kwargs): - self.data_cache.append(*args) + # If __call__ is invoked then self.request must be an instance of CompiledModel + signature = inspect.signature(self.request) + bound_args = signature.bind(*args, **kwargs).arguments + self.data_cache.append(copy.deepcopy(bound_args["inputs"])) return self.request(*args, **kwargs) def infer(self, inputs: Any = None, share_inputs: bool = False): - self.data_cache.append(inputs) + self.data_cache.append(copy.deepcopy(inputs)) return self.request.infer(inputs, share_inputs) def start_async( @@ -102,7 +106,7 @@ def start_async( *, shared_memory: Any = None, ): - self.data_cache.append(inputs) + self.data_cache.append(copy.deepcopy(inputs)) self.request.infer(inputs, share_inputs, share_outputs=True) def wait(self): diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 07a9f14774..16f848da9e 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -16,10 +16,12 @@ import tempfile import unittest +from collections import defaultdict from functools import partial import evaluate import numpy as np +import torch from datasets import load_dataset from parameterized import parameterized import openvino.runtime as ov @@ -30,6 +32,7 @@ AutoModelForCausalLM, AutoModelForTokenClassification, AutoTokenizer, + AutoProcessor, TrainingArguments, default_data_collator, ) @@ -45,6 +48,7 @@ OVModelForSeq2SeqLM, OVModelForSequenceClassification, OVModelForTokenClassification, + OVModelForSpeechSeq2Seq, OVStableDiffusionPipeline, OVStableDiffusionXLPipeline, OVQuantizer, @@ -54,6 +58,7 @@ from optimum.intel.openvino.configuration import INT8_WEIGHT_COMPRESSION_CONFIG +from optimum.intel.openvino.quantization import InferRequestWrapper from optimum.intel.utils.import_utils import is_openvino_version from utils_tests import MODEL_NAMES, get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8 @@ -589,3 +594,38 @@ def compute_metrics(p): tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) self.assertTrue("logits" in outputs) + + +class InferRequestWrapperTest(unittest.TestCase): + MODEL_ID = ("openai/whisper-tiny.en",) + + @staticmethod + def _generate_random_audio_data(processor): + t = np.linspace(0, 1.0, int(1000), 
endpoint=False) + audio_data = 0.5 * np.sin((2 + np.random.random()) * np.pi * t) + input_features = processor( + audio_data, + sampling_rate=16000, + return_tensors="pt", + ).input_features + return input_features + + @parameterized.expand(MODEL_ID) + def test_calibration_data_uniqueness(self, model_id): + ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True, compile=True) + processor = AutoProcessor.from_pretrained(model_id) + + calibration_data = [] + ov_model.decoder_with_past.request = InferRequestWrapper(ov_model.decoder_with_past.request, calibration_data) + for _ in range(2): + input_features = self._generate_random_audio_data(processor) + ov_model.generate(input_features) + + data_hashes_per_key = defaultdict(list) + for inputs_dict in calibration_data: + for k, v in inputs_dict.items(): + x = (v.numpy() if isinstance(v, torch.Tensor) else v).copy() + data_hashes_per_key[k].append(hash(x.tobytes())) + for k, data_hashes in data_hashes_per_key.items(): + # All hashes can not be equal because calibration dataset contains at least 2 different samples + self.assertTrue(any(data_hashes[0] != it for it in data_hashes)) From 99cf01ff69399668451b8dae0bc40c6b5efa356d Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 1 Mar 2024 15:00:40 +0100 Subject: [PATCH 02/30] Save an openvino config summarizing all information related to quantization when saving model (#578) * fix doc * remove default compression value * set default compression config when not provided * save openvino config to include quantization configuration * fix style * add test * update setup * style * remove from quantization_config key from ov_config * add test * update setup * modify method name --- docs/source/optimization_ov.mdx | 3 - optimum/intel/openvino/configuration.py | 2 +- optimum/intel/openvino/modeling_base.py | 38 +++++++++-- .../intel/openvino/modeling_base_seq2seq.py | 11 +++- optimum/intel/openvino/modeling_decoder.py | 24 ++++--- optimum/intel/openvino/modeling_diffusion.py | 21 ++++-- optimum/intel/openvino/quantization.py | 11 ++-- optimum/intel/openvino/trainer.py | 66 ++++++++++--------- setup.py | 12 +--- tests/openvino/test_quantization.py | 31 ++++++--- 10 files changed, 139 insertions(+), 80 deletions(-) diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 77dab40159..5686af4bf3 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -38,8 +38,6 @@ save_dir = "ptq_model" def preprocess_function(examples, tokenizer): return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True) -# Load the default quantization configuration detailing the quantization we wish to apply -quantization_config = OVConfig() # Instantiate our OVQuantizer using the desired configuration quantizer = OVQuantizer.from_pretrained(model) # Create the calibration dataset used to perform static quantization @@ -52,7 +50,6 @@ calibration_dataset = quantizer.get_calibration_dataset( ) # Apply static quantization and export the resulting quantized model to OpenVINO IR format quantizer.quantize( - quantization_config=quantization_config, calibration_dataset=calibration_dataset, save_directory=save_dir, ) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 9f3e3a06ca..8ddd005279 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -114,7 +114,7 @@ def __init__( **kwargs, ): 
super().__init__() - self.compression = compression or DEFAULT_QUANTIZATION_CONFIG + self.compression = compression self.input_info = input_info self.save_onnx_model = save_onnx_model self._enable_standard_onnx_export_option() diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 51633b0210..af00f7a06e 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -57,6 +57,7 @@ def __init__( dynamic_shapes: bool = True, ov_config: Optional[Dict[str, str]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): self.config = config @@ -91,6 +92,10 @@ def __init__( self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None + self._openvino_config = None + if quantization_config: + self._openvino_config = OVConfig(quantization_config=quantization_config) + @staticmethod def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None): """ @@ -143,6 +148,15 @@ def _save_pretrained(self, save_directory: Union[str, Path]): dst_path = os.path.join(save_directory, OV_XML_FILE_NAME) openvino.save_model(self.model, dst_path, compress_to_fp16=False) + self._save_openvino_config(save_directory) + + def _save_openvino_config(self, save_directory: Union[str, Path]): + if self._openvino_config is not None: + if not isinstance(self._openvino_config.quantization_config.dataset, (str, type(None))): + self._openvino_config.quantization_config.dataset = None + + self._openvino_config.save_pretrained(save_directory) + @classmethod def _from_pretrained( cls, @@ -203,12 +217,28 @@ def _from_pretrained( local_files_only=local_files_only, ) - # Give default quantization config if not provided and load_in_8bit=True - if load_in_8bit: - quantization_config = quantization_config or {"bits": 8} + quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) model = cls.load_model(model_cache_path, quantization_config=quantization_config) - return cls(model, config=config, model_save_dir=model_cache_path.parent, **kwargs) + return cls( + model, + config=config, + model_save_dir=model_cache_path.parent, + quantization_config=quantization_config, + **kwargs, + ) + + @staticmethod + def _prepare_weight_quantization_config( + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, load_in_8bit: bool = False + ): + # Give default quantization config if not provided and load_in_8bit=True + if not quantization_config and load_in_8bit: + quantization_config = OVWeightQuantizationConfig(bits=8) + elif isinstance(quantization_config, dict): + quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) + + return quantization_config @staticmethod def _cached_file( diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index df9449b0b5..3cb43e61b8 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -58,6 +58,7 @@ def __init__( dynamic_shapes: bool = True, ov_config: Optional[Dict[str, str]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, **kwargs, ): self.config = config @@ -76,6 +77,9 @@ def __init__( self.decoder_model = decoder 
self.decoder_with_past_model = decoder_with_past self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None + self._openvino_config = None + if quantization_config: + self._openvino_config = OVConfig(quantization_config=quantization_config) def _save_pretrained(self, save_directory: Union[str, Path]): """ @@ -96,6 +100,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): dst_path = os.path.join(save_directory, dst_file_name) openvino.save_model(src_file, dst_path, compress_to_fp16=False) + self._save_openvino_config(save_directory) + @classmethod def _from_pretrained( cls, @@ -155,9 +161,7 @@ def _from_pretrained( decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name decoder_with_past = None - # Give default quantization config if not provided and load_in_8bit=True - if load_in_8bit: - quantization_config = quantization_config or {"bits": 8} + quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) # Load model from a local directory if os.path.isdir(model_id): @@ -205,6 +209,7 @@ def _from_pretrained( decoder_with_past=decoder_with_past, config=config, model_save_dir=model_save_dir, + quantization_config=quantization_config, **kwargs, ) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index c0274d3f5b..92a2ce436d 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import logging import os from pathlib import Path @@ -100,6 +101,7 @@ def __init__( dynamic_shapes: bool = True, ov_config: Optional[Dict[str, str]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): if not dynamic_shapes: @@ -117,6 +119,7 @@ def __init__( dynamic_shapes=False, ov_config=ov_config, model_save_dir=model_save_dir, + quantization_config=quantization_config, **kwargs, ) @@ -224,6 +227,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): dst_path = os.path.join(save_directory, OV_XML_FILE_NAME) openvino.save_model(model_to_save, dst_path, compress_to_fp16=False) + self._save_openvino_config(save_directory) + @classmethod def _from_transformers( cls, @@ -576,15 +581,10 @@ def _from_pretrained( local_files_only=local_files_only, ) - # Give default quantization config if not provided and load_in_8bit=True - if load_in_8bit: - quantization_config = quantization_config or {"bits": 8} - - if isinstance(quantization_config, dict): - if quantization_config == {"bits": 4} and config.name_or_path in _DEFAULT_4BIT_CONFIGS: - quantization_config = _DEFAULT_4BIT_CONFIGS[config.name_or_path] + if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}: + quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config) - quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) + quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) load_in_4bit = quantization_config.bits == 4 if quantization_config else False model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config) @@ -603,7 +603,12 @@ def _from_pretrained( enable_compilation = kwargs.pop("compile", True) and not 
load_in_4bit causal_model = init_cls( - model=model, config=config, model_save_dir=model_cache_path.parent, compile=enable_compilation, **kwargs + model=model, + config=config, + model_save_dir=model_cache_path.parent, + compile=enable_compilation, + quantization_config=quantization_config, + **kwargs, ) if load_in_4bit: @@ -632,6 +637,7 @@ def _from_pretrained( # seqlen = get_seqlen(causal_model) dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32) dataset = prepare_dataset(dataset) + quantization_config = copy.deepcopy(quantization_config) quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) _weight_only_quantization(model, quantization_config) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 5633f852a8..1570a22457 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -87,6 +87,7 @@ def __init__( compile: bool = True, ov_config: Optional[Dict[str, str]] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): self._internal_dict = config @@ -140,6 +141,10 @@ def __init__( self._internal_dict.pop("vae", None) + self._openvino_config = None + if quantization_config: + self._openvino_config = OVConfig(quantization_config=quantization_config) + def _save_pretrained(self, save_directory: Union[str, Path]): """ Saves the model to the OpenVINO IR format so that it can be re-loaded using the @@ -177,6 +182,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + self._save_openvino_config(save_directory) + @classmethod def _from_pretrained( cls, @@ -257,10 +264,7 @@ def _from_pretrained( else: kwargs[name] = load_method(new_model_save_dir) - # Give default quantization config if not provided and load_in_8bit=True - if load_in_8bit: - quantization_config = quantization_config or {"bits": 8} - + quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) unet = cls.load_model( new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, quantization_config ) @@ -278,7 +282,14 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + return cls( + unet=unet, + config=config, + model_save_dir=model_save_dir, + quantization_config=quantization_config, + **components, + **kwargs, + ) @classmethod def _from_transformers( diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 331248e023..d7b88f2be3 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -45,7 +45,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, @@ -235,8 +235,11 @@ def quantize( ) ov_config = ov_config or quantization_config - if ov_config is not None and not isinstance(ov_config, OVConfig): - raise TypeError(f"`ov_config` 
should be an `OVConfig`, but got: {type(ov_config)} instead.") + if ov_config is not None: + if not isinstance(ov_config, OVConfig): + raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + elif ov_config.compression is None: + ov_config.compression = DEFAULT_QUANTIZATION_CONFIG if isinstance(self.model, OVBaseModel): self._quantize_ovbasemodel( @@ -355,7 +358,7 @@ def _quantize_torchmodel( logger.info( "No configuration describing the quantization process was provided, a default OVConfig will be generated." ) - ov_config = OVConfig() + ov_config = OVConfig(compression=DEFAULT_QUANTIZATION_CONFIG) onnx_file_name = ( ONNX_WEIGHTS_NAME if file_name is None and ov_config.save_onnx_model diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 5c7d392292..b7d110c96a 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -89,7 +89,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_transformers_version -from .configuration import OVConfig +from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig from .quantization import OVDataLoader from .training_args import OVTrainingArguments from .utils import ( @@ -225,37 +225,41 @@ def __init__( self.teacher.eval() self.compression_controller = None - if self.ov_config is not None and self.args.do_train: - self._set_task() - train_dataloader = self.get_train_dataloader() - model_inputs = next(iter(train_dataloader)) - for label_name in self.label_names: - model_inputs.pop(label_name) - force_batch_one = self._is_pruning_enabled() - self.ov_config.add_input_info(model_inputs, force_batch_one) - nncf_config = NNCFConfig.from_dict(self.ov_config.__dict__) - nncf_config.register_extra_structs( - [ - QuantizationRangeInitArgs(OVDataLoader(train_dataloader)), - BNAdaptationInitArgs(OVDataLoader(train_dataloader)), - ] - ) + if self.ov_config is not None: + if self.ov_config.compression is None: + self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG + + if self.args.do_train: + self._set_task() + train_dataloader = self.get_train_dataloader() + model_inputs = next(iter(train_dataloader)) + for label_name in self.label_names: + model_inputs.pop(label_name) + force_batch_one = self._is_pruning_enabled() + self.ov_config.add_input_info(model_inputs, force_batch_one) + nncf_config = NNCFConfig.from_dict(self.ov_config.__dict__) + nncf_config.register_extra_structs( + [ + QuantizationRangeInitArgs(OVDataLoader(train_dataloader)), + BNAdaptationInitArgs(OVDataLoader(train_dataloader)), + ] + ) - # Configure NNCF logging - # Disable nncf logging to stdout except error - # but to file nncf_output.log - nncf_config["log_dir"] = args.output_dir - nncf_log_file_handler = logging.logging.FileHandler(os.path.join(args.output_dir, NNCF_LOG_FILE_NAME)) - nncf_log_file_handler.setFormatter(logging.logging.Formatter("%(levelname)s:%(name)s:%(message)s")) - nncf_logger.addHandler(nncf_log_file_handler) - set_log_level(logging.ERROR) - nncf_logger.setLevel(logging.INFO) - nncf_log_file_handler.setLevel(logging.INFO) - - self.compression_controller, self.model = create_compressed_model(self.model, nncf_config) - self.model_wrapped = self.model - # TODO : To deprecate once support transformers > 4.30.0 - self.deepspeed = None + # Configure NNCF logging + # Disable nncf logging to stdout except error + # but to file nncf_output.log + nncf_config["log_dir"] = args.output_dir + nncf_log_file_handler = 
logging.logging.FileHandler(os.path.join(args.output_dir, NNCF_LOG_FILE_NAME)) + nncf_log_file_handler.setFormatter(logging.logging.Formatter("%(levelname)s:%(name)s:%(message)s")) + nncf_logger.addHandler(nncf_log_file_handler) + set_log_level(logging.ERROR) + nncf_logger.setLevel(logging.INFO) + nncf_log_file_handler.setLevel(logging.INFO) + + self.compression_controller, self.model = create_compressed_model(self.model, nncf_config) + self.model_wrapped = self.model + # TODO : To deprecate once support transformers > 4.30.0 + self.deepspeed = None def _set_signature_columns_if_needed(self): if self._signature_columns is None: diff --git a/setup.py b/setup.py index 1701af990c..91fc19f744 100644 --- a/setup.py +++ b/setup.py @@ -39,16 +39,8 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] EXTRAS_REQUIRE = { - "neural-compressor": [ - "neural-compressor>=2.2.0", - "onnx", - "onnxruntime<1.15.0", - ], - "openvino": [ - "openvino>=2023.3", - "onnx", - "onnxruntime", - ], + "neural-compressor": ["neural-compressor>=2.2.0", "onnx", "onnxruntime<1.15.0"], + "openvino": ["openvino>=2023.3", "onnx", "onnxruntime"], "openvino-tokenizers": ["openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.8.1"], "ipex": ["intel-extension-for-pytorch", "onnx"], diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 16f848da9e..0ef89ec8b8 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -56,8 +56,7 @@ OVWeightQuantizationConfig, ) - -from optimum.intel.openvino.configuration import INT8_WEIGHT_COMPRESSION_CONFIG +from optimum.intel.openvino.configuration import INT8_WEIGHT_COMPRESSION_CONFIG, DEFAULT_QUANTIZATION_CONFIG from optimum.intel.openvino.quantization import InferRequestWrapper from optimum.intel.utils.import_utils import is_openvino_version from utils_tests import MODEL_NAMES, get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8 @@ -111,9 +110,8 @@ def preprocess_function(examples, tokenizer): self.assertTrue("logits" in outputs) # Verify that that the configuration is correctly saved and loaded - expected_config = OVConfig() loaded_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(expected_config.to_dict()["compression"], loaded_config.to_dict()["compression"]) + self.assertEqual(DEFAULT_QUANTIZATION_CONFIG, loaded_config.to_dict()["compression"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): @@ -160,7 +158,7 @@ class OVWeightCompressionTest(unittest.TestCase): ) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 6, 379),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 388),) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = ( (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 16, 136), ) @@ -237,6 +235,8 @@ class OVWeightCompressionTest(unittest.TestCase): IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") + DEFAULT_INT4_CONFIG = {"bits": 4, "sym": True, "group_size": 64, "all_layers": True} + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = 
model_cls.export_feature @@ -336,6 +336,8 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False) + self.assertEqual(model._openvino_config.quantization_config.bits, 8) + self.assertEqual(model._openvino_config.dtype, "int8") if model.export_feature.startswith("text2text-generation"): models = [model.encoder, model.decoder, model.decoder_with_past] @@ -351,12 +353,13 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): self.assertEqual(expected_ov_int8[i], num_int8) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS) + @unittest.mock.patch.dict( + "optimum.intel.openvino.configuration._DEFAULT_4BIT_CONFIGS", {"facebook/opt-125m": DEFAULT_INT4_CONFIG} + ) def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_int8, expected_ov_int4): with tempfile.TemporaryDirectory() as tmp_dir: model_id = MODEL_NAMES[model_type] - model = model_cls.from_pretrained( - model_id, export=True, quantization_config=OVWeightQuantizationConfig(bits=4) - ) + model = model_cls.from_pretrained(model_id, export=True, quantization_config={"bits": 4}) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -366,6 +369,13 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ self.assertEqual(expected_ov_int8, num_int8) model.save_pretrained(tmp_dir) + openvino_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.dtype, "int4") + if model_id == "facebook/opt-125m": + for key, value in self.DEFAULT_INT4_CONFIG.items(): + self.assertEqual(value, openvino_config.quantization_config[key]) + @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( self, model_cls, model_id, quantization_config, expected_ov_int4 @@ -380,8 +390,9 @@ def test_ovmodel_4bit_auto_compression_with_config( self.assertEqual(expected_ov_int4, num_int4) model.save_pretrained(tmp_dir) - ov_config = OVConfig(quantization_config=quantization_config) - ov_config.save_pretrained(tmp_dir) + openvino_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.dtype, "int4") @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS) def test_ovmodel_4bit_auto_compression_with_custom_dataset( From eb0dcdb459d7b05055a6d72ebbf62ed7f60ed80b Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 4 Mar 2024 11:10:55 +0100 Subject: [PATCH 03/30] Fix warning (#582) * Fix warning * fix message warning --- optimum/intel/generation/modeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py index 0abdafe666..3d9c657626 100644 --- a/optimum/intel/generation/modeling.py +++ b/optimum/intel/generation/modeling.py @@ -105,13 +105,13 @@ def __init__( self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) self.model_dtype = kwargs.get("model_dtype", None) - logger.warning( - f"The 
class `{self.__class__}` has been depreciated and will be removed in optimum-intel v1.14, please use IPEXModel instead" - ) if isinstance(model, torch.jit.ScriptModule): self.input_names = { inputs.debugName().split(".")[0] for inputs in model.graph.inputs() if inputs.debugName() != "self" } + logger.warning( + f"The class `{self.__class__}` has been depreciated for TorchScript model, please use `IPEXModelForCausalLM` instead" + ) else: self.input_names = set() From 77365f458d09a166b1af9b0b043443e7bc2b2318 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 4 Mar 2024 11:27:39 +0100 Subject: [PATCH 04/30] Add reference to the temporary directory for windows fix (#581) --- optimum/intel/openvino/modeling_diffusion.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1570a22457..5e8a0cdc59 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -94,9 +94,18 @@ def __init__( self._device = device.upper() self.is_dynamic = dynamic_shapes self.ov_config = ov_config if ov_config is not None else {} - self._model_save_dir = ( - Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir - ) + + # This attribute is needed to keep one reference on the temporary directory, since garbage collecting + # would end-up removing the directory containing the underlying OpenVINO model + self._model_save_dir_tempdirectory_instance = None + if isinstance(model_save_dir, TemporaryDirectory): + self._model_save_dir_tempdirectory_instance = model_save_dir + self._model_save_dir = Path(model_save_dir.name) + elif isinstance(model_save_dir, str): + self._model_save_dir = Path(model_save_dir) + else: + self._model_save_dir = model_save_dir + self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) self.unet = OVModelUnet(unet, self) self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None From 5e319aae58f154af06bf9d9d125b138ed79d2450 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 4 Mar 2024 12:02:27 +0100 Subject: [PATCH 05/30] Fix documentation (#583) * Fix documentation * fix --- docs/source/inference.mdx | 7 +++++-- docs/source/optimization_ov.mdx | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index a9ee5529da..905e0aa4dd 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -110,7 +110,7 @@ By default the quantization scheme will be [assymmetric](https://github.com/open For INT4 quantization you can also specify the following arguments : * The `--group-size` parameter will define the group size to use for quantization, `-1` it will results in per-column quantization. -* The `--ratio` CLI parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`. +* The `--ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`. Smaller `group_size` and `ratio` of usually improve accuracy at the sacrifice of the model size and inference latency. 
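The same 4-bit weight-only quantization can also be requested through the Python API. A minimal sketch, assuming `OVWeightQuantizationConfig` exposes `group_size` and `ratio` parameters matching the CLI flags described above (model id and values are illustrative only):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# 4-bit weight-only quantization, roughly equivalent to
# `optimum-cli export openvino --weight-format int4 --group-size 128 --ratio 0.9`
quantization_config = OVWeightQuantizationConfig(bits=4, group_size=128, ratio=0.9)
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, quantization_config=quantization_config)
model.save_pretrained("ov_model_int4")
```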
@@ -122,8 +122,11 @@ from optimum.intel import OVModelForCausalLM model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) ``` -> **NOTE:** `load_in_8bit` is enabled by default for the models larger than 1 billion parameters. + +`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. + + To apply quantization on both weights and activations, you can use the `OVQuantizer`, more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#optimization). diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 5686af4bf3..51067b0b64 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -69,7 +69,11 @@ from optimum.intel import OVModelForCausalLM model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) ``` -> **NOTE:** `load_in_8bit` is enabled by default for models larger than 1 billion parameters. + + +`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. + + For the 4-bit weight quantization you can use the `quantization_config` to specify the optimization parameters, for example: From f1984f4fe0b099a8159e75883cbd65eb2dfd3754 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 6 Mar 2024 18:08:32 +0800 Subject: [PATCH 06/30] Add llama test model to cover MQA (#585) * change llama test model to cover MQA * keep llama and llama2 in tests * fix code style --- tests/generation/test_modeling.py | 2 ++ tests/ipex/test_inference.py | 2 ++ tests/ipex/test_modeling.py | 6 +++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/generation/test_modeling.py b/tests/generation/test_modeling.py index b97fd66a83..9b637d322d 100644 --- a/tests/generation/test_modeling.py +++ b/tests/generation/test_modeling.py @@ -31,6 +31,7 @@ "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "mistral": "echarlaix/tiny-random-mistral", "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", } @@ -54,6 +55,7 @@ class ModelingIntegrationTest(unittest.TestCase): "gpt_neo", "mistral", "llama", + "llama2", # "gpt_bigcode", ) diff --git a/tests/ipex/test_inference.py b/tests/ipex/test_inference.py index bc1890453d..e120514506 100644 --- a/tests/ipex/test_inference.py +++ b/tests/ipex/test_inference.py @@ -42,6 +42,7 @@ "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", "opt": "hf-internal-testing/tiny-random-OPTModel", "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", } @@ -66,6 +67,7 @@ class IPEXIntegrationTest(unittest.TestCase): "gpt_neo", # "gpt_bigcode", "llama", + "llama2", "opt", "mpt", ) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index ffc2ca6a89..03b7d015d1 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -67,6 +67,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "levit": "hf-internal-testing/tiny-random-LevitModel", "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", "marian": "sshleifer/tiny-marian-en-de", "mbart": "hf-internal-testing/tiny-random-mbart", "mistral": "echarlaix/tiny-random-mistral", @@ -209,6 +210,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "gpt_neo", 
"gpt_neox", "llama", + "llama2", "mistral", # "phi", "mpt", @@ -226,7 +228,9 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(ipex_model.use_cache) tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer( - "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None + "This is a sample", + return_tensors="pt", + return_token_type_ids=False if model_arch in ("llama", "llama2") else None, ) position_ids = None if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: From 1a821b64cd98a93f41988571c6c468f3f8e86fd6 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 6 Mar 2024 16:27:45 +0400 Subject: [PATCH 07/30] Include nncf in openvino extra (#586) --- README.md | 2 +- docs/source/installation.mdx | 2 +- optimum/exporters/openvino/model_patcher.py | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7b762cce26..ec35e602ca 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| | [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | -| [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino,nncf]"` | +| [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino]"` | | [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index cf8688d105..87698a6514 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -21,7 +21,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------| | [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"`| -| [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino,nncf]"` | +| [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f953771a7a..aea57161e2 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -35,7 +35,7 @@ def patch_model_with_bettertransformer(model): + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. 
" f"This Python environment has Transformers {_transformers_version} and PyTorch {_torch_version}. " "Consider upgrading PyTorch and Transformers, for example by running " - "`pip install --upgrade --upgrade-strategy eager optimum[openvino,nncf]`, and export the model again" + "`pip install --upgrade --upgrade-strategy eager optimum[openvino]`, and export the model again" + COLOR_RESET ) diff --git a/setup.py b/setup.py index 91fc19f744..dd98548018 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ EXTRAS_REQUIRE = { "neural-compressor": ["neural-compressor>=2.2.0", "onnx", "onnxruntime<1.15.0"], - "openvino": ["openvino>=2023.3", "onnx", "onnxruntime"], + "openvino": ["openvino>=2023.3", "onnx", "onnxruntime", "nncf>=2.8.1"], "openvino-tokenizers": ["openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.8.1"], "ipex": ["intel-extension-for-pytorch", "onnx"], From 8f75658b160a19762e8020712a752809ab0761d3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:16:08 +0100 Subject: [PATCH 08/30] Fix title documentation (#588) --- docs/source/inference.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index 905e0aa4dd..82e2315454 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -471,7 +471,7 @@ image = refiner(prompt=prompt, image=image[None, :]).images[0] ``` -## Latent Consistency Models +### Latent Consistency Models | Task | Auto Class | @@ -479,7 +479,7 @@ image = refiner(prompt=prompt, image=image[None, :]).images[0] | `text-to-image` | `OVLatentConsistencyModelPipeline` | -### Text-to-Image +#### Text-to-Image Here is an example of how you can load a Latent Consistency Models (LCMs) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using OpenVINO : From 81e180f273068749eebc4963fe0c1be84a2457b6 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Thu, 7 Mar 2024 10:30:25 +0100 Subject: [PATCH 09/30] Update OpenVINO documentation links in README.md (#587) * Update OpenVINO documentation links in README.md The links are now aligned with OpenVINO 2024.0 documentation, and include permalinks instead of direct links, when possible. * Update inference.mdx * Update index.mdx * Update installation.mdx * Update README.md --- README.md | 8 ++++---- docs/source/index.mdx | 4 ++-- docs/source/inference.mdx | 7 ++++--- docs/source/installation.mdx | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ec35e602ca..7905cefded 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target. 
-[OpenVINO](https://docs.openvino.ai/latest/index.html) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. +[OpenVINO](https://docs.openvino.ai) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/2024/about-openvino/compatibility-and-support/supported-devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. ## Installation @@ -20,7 +20,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| | [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | -| [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino]"` | +| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade-strategy eager "optimum[openvino]"` | | [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -68,11 +68,11 @@ For more details on the supported compression techniques, please refer to the [d ## OpenVINO -Below are the examples of how to use OpenVINO and its [NNCF](https://docs.openvino.ai/latest/tmo_introduction.html) framework to accelerate inference. +Below are examples of how to use OpenVINO and its [NNCF](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/compressing-models-during-training.html) framework to accelerate inference. #### Export: -It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI : +It is possible to export your model to the [OpenVINO IR](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) format with the CLI : ```plain optimum-cli export openvino --model gpt2 ov_model diff --git a/docs/source/index.mdx b/docs/source/index.mdx index cbec79baa9..643b9be044 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -21,7 +21,7 @@ limitations under the License. 
[Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target. -[OpenVINO](https://docs.openvino.ai/latest/index.html) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. +[OpenVINO](https://docs.openvino.ai) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/2024/about-openvino/compatibility-and-support/supported-devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime.
@@ -34,4 +34,4 @@ limitations under the License.

Learn how to run inference with OpenVINO Runtime and to apply quantization, pruning and knowledge distillation on your model to further speed up inference.

-
\ No newline at end of file + diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index 82e2315454..65480c1d2f 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -13,7 +13,8 @@ Optimum Intel can be used to load optimized models from the [Hugging Face Hub](h ## Transformers models -You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). +You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors +([see](https://docs.openvino.ai/2024/about-openvino/compatibility-and-support/supported-devices.html) the full list of supported devices). For that, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. As shown in the table below, each task is associated with a class enabling to automatically load your model. @@ -33,7 +34,7 @@ As shown in the table below, each task is associated with a class enabling to au ### Export -It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI : +It is possible to export your model to the [OpenVINO IR](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) format with the CLI : ```bash optimum-cli export openvino --model gpt2 ov_model @@ -182,7 +183,7 @@ model.reshape(1,128) model.compile() ``` -To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/nightly/openvino_docs_install_guides_configurations_for_intel_gpu.html) about installing drivers for GPU inference). +To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html) about installing drivers for GPU inference). ```python # Static shapes speed up inference diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 87698a6514..c29f5ceb95 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -21,7 +21,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------| | [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"`| -| [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino]"` | +| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade-strategy eager "optimum[openvino]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -42,4 +42,4 @@ or to install from source including dependencies: python -m pip install "optimum-intel[extras]"@git+https://github.com/huggingface/optimum-intel.git ``` -where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`. \ No newline at end of file +where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`. 
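Taken together, the documentation updates above boil down to the following loading pattern. A minimal sketch (the model id is illustrative; when no explicit setting is passed, `load_in_8bit` is also applied by default for models above one billion parameters):

```python
from optimum.intel import OVModelForCausalLM

# Export the model to OpenVINO IR with 8-bit weight compression
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)
# Per #578 above, saving also writes an OVConfig summarizing the quantization settings
model.save_pretrained("ov_model_int8")
```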
From 8c95cae69b72547cce9caef4e018925593f220ed Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 7 Mar 2024 16:50:54 +0100 Subject: [PATCH 10/30] Fix default int8 quantization for CLI (#592) --- optimum/commands/export/openvino.py | 19 +++++++++---------- optimum/exporters/openvino/__main__.py | 17 ++--------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 255e2a7e13..997ec44aa5 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -157,13 +157,12 @@ def run(self): ) self.args.weight_format = "int8" - weight_format = self.args.weight_format or "fp32" - - ov_config = None - if weight_format in {"fp16", "fp32"}: - ov_config = OVConfig(dtype=weight_format) + if self.args.weight_format is None: + ov_config = None + elif self.args.weight_format in {"fp16", "fp32"}: + ov_config = OVConfig(dtype=self.args.weight_format) else: - is_int8 = weight_format == "int8" + is_int8 = self.args.weight_format == "int8" # For int4 quantization if not parameter is provided, then use the default config if exist if ( @@ -182,12 +181,12 @@ def run(self): "group_size": -1 if is_int8 else self.args.group_size, } - if weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: + if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: logger.warning( - f"--weight-format {weight_format} is deprecated, possible choices are fp32, fp16, int8, int4" + f"--weight-format {self.args.weight_format} is deprecated, possible choices are fp32, fp16, int8, int4" ) - quantization_config["sym"] = "asym" not in weight_format - quantization_config["group_size"] = 128 if "128" in weight_format else 64 + quantization_config["sym"] = "asym" not in self.args.weight_format + quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64 ov_config = OVConfig(quantization_config=quantization_config) # TODO : add input shapes diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 24b65f9032..1c695e2f19 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -21,26 +21,13 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig +from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.utils.save_utils import maybe_load_preprocessors -from ...intel.utils.import_utils import ( - is_openvino_tokenizers_available, - is_optimum_version, - is_transformers_version, -) +from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from .convert import export_from_model, export_tokenizer -if is_optimum_version(">=", "1.16.0"): - from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED -else: - # Copied from https://github.com/huggingface/optimum/blob/main/optimum/exporters/onnx/constants.py - SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED = [ - "bart", - "whisper", - ] - - if TYPE_CHECKING: from optimum.intel.openvino.configuration import OVConfig From c356aa392301a507c2ce01ac86273955ceab7a76 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:16:35 +0800 Subject: [PATCH 11/30] Change model output parameter to last_hidden_states for IPEXModel (#589) * change model output parameter to 
last_hidden_states * update ipex model testiong * update testing * add output name to ipex model --- optimum/intel/ipex/modeling_base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 2b6b569343..9928977ead 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -58,6 +58,7 @@ class IPEXModel(OptimizedModel): export_feature = "feature-extraction" base_model_prefix = "ipex_model" main_input_name = "input_ids" + output_name = "last_hidden_state" def __init__( self, @@ -193,7 +194,12 @@ def forward( inputs["token_type_ids"] = token_type_ids outputs = self._call_model(**inputs) - return ModelOutput(**outputs) if isinstance(outputs, dict) else ModelOutput(logits=outputs[0]) + if isinstance(outputs, dict): + model_output = ModelOutput(**outputs) + else: + model_output = ModelOutput() + model_output[self.output_name] = outputs[0] + return model_output def eval(self): self.model.eval() @@ -235,16 +241,19 @@ def _init_warmup(self): class IPEXModelForSequenceClassification(IPEXModel): auto_model_class = AutoModelForSequenceClassification export_feature = "text-classification" + output_name = "logits" class IPEXModelForTokenClassification(IPEXModel): auto_model_class = AutoModelForTokenClassification export_feature = "token-classification" + output_name = "logits" class IPEXModelForMaskedLM(IPEXModel): auto_model_class = AutoModelForMaskedLM export_feature = "fill-mask" + output_name = "logits" class IPEXModelForImageClassification(IPEXModel): From 6e8cd3d8134438b7a488d98a48d62607c3185412 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Fri, 8 Mar 2024 18:48:02 +0800 Subject: [PATCH 12/30] Add IPEX model patcher (#567) * llama model patcher * fix jit model * fix jit model * rm autocast in model * add llama model patcher * support assisted decoding and add reorder cache function * add comment for _prepare_past_key_values * rebase main * fix model_dtype * rm useless comments * fix llama * add comments for ipex_rope and ipex_scale_dot_product * fix comments * add enable_tpp comments * fix import * fix review aroun2 * add torch.no_grad to avoid auto_kernel_selection issue * use torch.no_grad in jit trace * fix ipex model testing * add tests for ipex model generation with multi inputs * fix code style * remove __get__(self) as _reorder_cache is static method for the class * fix reorder_cache * use model_type * check if reorder_cache is a static method * fix _reorder_cache * fix raise import error * test ipex patching * fix comments * update API name and testing * disable untill ipex version 2.5.0 * update testing name * Update optimum/intel/ipex/modeling_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update tests/ipex/test_modeling.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix tests --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/ipex/__init__.py | 0 optimum/exporters/ipex/model_patcher.py | 91 +++++++ optimum/exporters/ipex/modeling_utils.py | 307 +++++++++++++++++++++++ optimum/intel/ipex/modeling_base.py | 135 ++++++++-- tests/ipex/test_modeling.py | 39 +++ 5 files changed, 558 insertions(+), 14 deletions(-) create mode 100644 optimum/exporters/ipex/__init__.py create mode 100644 optimum/exporters/ipex/model_patcher.py create mode 100644 
optimum/exporters/ipex/modeling_utils.py diff --git a/optimum/exporters/ipex/__init__.py b/optimum/exporters/ipex/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py new file mode 100644 index 0000000000..60ff3b721b --- /dev/null +++ b/optimum/exporters/ipex/model_patcher.py @@ -0,0 +1,91 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers.models.llama.modeling_llama import ( + LlamaAttention, + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaModel, + LlamaRMSNorm, +) + +from optimum.intel.utils.import_utils import is_ipex_version + +from .modeling_utils import ( + _IPEXLlamaDecoderLayerRef, + _llama_attn_forward, + _llama_layer_norm_forward, + _llama_model_forward, +) + + +_IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) +_IPEX_EXPORTED_TASK = ("text-generation",) + + +def convert_func(m, func_name, new_function): + bound_method = new_function.__get__(m, m.__class__) + setattr(m, func_name, bound_method) + + +def convert_functions(m, target_m, new_function_name, new_function): + for _, sub_m in m.named_children(): + if isinstance(sub_m, target_m): + convert_func(sub_m, new_function_name, new_function) + convert_functions(sub_m, target_m, new_function_name, new_function) + + +def convert_class(m, target_m, new_class, config, distributed=False): + for name, sub_m in m.named_children(): + if isinstance(sub_m, target_m): + new_m = new_class(sub_m, config, distributed) + setattr(m, name, new_m) + convert_class(sub_m, target_m, new_class, config, distributed) + + +def patch_op(m, target_m, new_op_name, new_op): + for name, sub_m in m.named_children(): + if isinstance(sub_m, target_m): + setattr(sub_m, new_op_name, new_op) + patch_op(sub_m, target_m, new_op_name, new_op) + + +def _patch_llama_model(model): + if is_ipex_version("<", "2.5.0"): + raise ImportError("Only ipex version > 2.3.0 supports RotaryEmbedding and IndirectAccessKVCache") + + from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCache, RotaryEmbedding + + ipex_rope = RotaryEmbedding( + model.config.max_position_embeddings, + model.config.hidden_size // model.config.num_attention_heads, + model.config.rope_theta, + model.config.architectures[0], + ) + ipex_scale_dot_product = IndirectAccessKVCache(text_max_length=model.config.max_position_embeddings) + patch_op(model, LlamaAttention, "ipex_rope", ipex_rope) + patch_op(model, LlamaAttention, "ipex_scale_dot_product", ipex_scale_dot_product) + + convert_functions(model, LlamaModel, "forward", _llama_model_forward) + convert_functions(model, LlamaAttention, "forward", _llama_attn_forward) + convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward) + + convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayerRef, model.config) + return model + + +def _patch_model(model): + if isinstance(model, LlamaForCausalLM): + model = _patch_llama_model(model) + return model diff --git 
a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py new file mode 100644 index 0000000000..f75e559eaf --- /dev/null +++ b/optimum/exporters/ipex/modeling_utils.py @@ -0,0 +1,307 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.models.llama.modeling_llama import repeat_kv + +from optimum.intel.utils.import_utils import is_ipex_version + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 +def _llama_layer_norm_forward(self, hidden_states): + return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L321 +def _llama_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + kv_seq_len = q_len + past_key_value[0].size(-2) if past_key_value is not None else q_len + + query = query.view(bsz, q_len, self.num_heads, self.head_dim) + key = key.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value = value.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + # Use ipex op to rotary position embedding more efficient. + key = self.ipex_rope( + key, + position_ids, + self.num_key_value_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + query = self.ipex_rope( + query, + position_ids, + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + + if use_cache: + # This ipex op pre-allocates buffers for past_key_values and use beam index history + # which to decide which beam should be used to make attention scale dot more efficient. 
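+        # The call below feeds the rotated query/key/value and the pre-allocated cache into the fused
+        # kernel, which returns the attention output, the attention weights and the updated past_key_value.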
+ (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( + query, + key, + value, + math.sqrt(self.head_dim), + past_key_value, + None, + attention_mask, + ) + else: + value_states = value.transpose(1, 2) + query_states = query.transpose(1, 2) + key_states = key.transpose(1, 2) + kv_seq_len = key_states.shape[-2] + + past_key_value = None + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: + attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask) + attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L1130 +def _llama_model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if getattr(self.config, "_flash_attn_2_enabled", False): + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, 
seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 +class _IPEXLlamaDecoderLayerRef(nn.Module): + def __init__(self, module, config, distributed=False): + if is_ipex_version("<", "2.5.0"): + raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") + + from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd + + super().__init__() + for k, v in module.__dict__.items(): + setattr(self, k, v) + for k, v in module.__class__.__dict__.items(): + if k.startswith("__") or k.startswith("forward"): + continue + setattr(self.__class__, k, getattr(module.__class__, k)) + self.distributed = distributed + if not self.distributed: + self.mha_linear_add = LinearAdd(module.self_attn.o_proj) + self.mlp_linear_add = LinearAdd(module.mlp.down_proj) + del self.__dict__["_modules"]["self_attn"].o_proj + del self.__dict__["_modules"]["mlp"].down_proj + self.linear_silu_mul = Linear2SiluMul(module.mlp.gate_proj, module.mlp.up_proj) + del self.__dict__["_modules"]["mlp"].gate_proj + del self.__dict__["_modules"]["mlp"].up_proj + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.o_proj(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + + mlp_gate = self.linear_silu_mul(hidden_states) + + if not self.distributed: + hidden_states = self.mlp_linear_add(mlp_gate, residual) + else: + hidden_states = self.mlp.down_proj(mlp_gate) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 9928977ead..00fe3de115 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -22,6 +22,8 @@ import intel_extension_for_pytorch as ipex import torch from huggingface_hub import hf_hub_download +from intel_extension_for_pytorch.cpu._auto_kernel_selection import _enable_tpp +from intel_extension_for_pytorch.transformers.optimize import get_dummy_input from transformers import ( AutoConfig, AutoModel, @@ -45,14 +47,63 @@ from optimum.modeling_base import OptimizedModel from optimum.utils import NormalizedConfigManager -from ..generation.modeling import jit_trace, prepare_jit_inputs -from ..utils.import_utils import is_torch_version, is_transformers_version +from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _patch_model +from ..generation.modeling import prepare_jit_inputs +from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask logger = logging.getLogger(__name__) +_IPEX_SUPPORT_MODEL_TYPES = ("llama",) + + +def _is_patched_with_ipex(model, task): + if is_ipex_version("<", "2.5.0"): + return False + + if isinstance(model, torch.jit.ScriptModule): + for node in model.graph.nodes(): + # Jit will record the codes position so we can check if the node use ipex exporter. + if "torch_ipex::rotary_position_embedding" in node.__str__(): + return True + return False + else: + return model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES and task in _IPEX_EXPORTED_TASK + + +def ipex_jit_trace(model, task, use_cache): + # Only support torch version >= 2.1.0 to support example_kwarg_inputs in jit.trace + if is_torch_version("<", "2.1.0"): + raise ImportError("`torch>=2.1.0` is needed to trace your model") + + if _is_patched_with_ipex(model, task): + model = _patch_model(model) + sample_inputs = get_dummy_input(model, return_dict=True) + # Use Tensor Processing Primitives to accelerate linear, see https://arxiv.org/abs/2104.05755. 
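+        # TPP is only enabled on this patched-model path; the unpatched branch below relies on
+        # ipex.optimize's default kernel selection.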
+ _enable_tpp() + else: + model = patch_decoder_attention_mask(model) + sample_inputs = prepare_jit_inputs(model, task, use_cache) + + model.config.return_dict = False + + model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) + with torch.no_grad(): + trace_model = torch.jit.trace( + model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + trace_model = torch.jit.freeze(trace_model) + trace_model(**sample_inputs) + trace_model(**sample_inputs) + + return trace_model + + class IPEXModel(OptimizedModel): auto_model_class = AutoModel export_feature = "feature-extraction" @@ -74,6 +125,7 @@ def __init__( self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32 self.model.to(self._device) self.model_save_dir = model_save_dir + self._is_ipex_exported = _is_patched_with_ipex(model, self.export_feature) self.input_names = { inputs.debugName().split(".")[0] for inputs in model.graph.inputs() if inputs.debugName() != "self" @@ -91,13 +143,13 @@ def _from_transformers( cls, model_id: str, config: PretrainedConfig, + use_cache: bool = True, use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, cache_dir: Optional[str] = None, subfolder: str = "", local_files_only: bool = False, - use_cache: bool = True, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: bool = False, ): @@ -117,14 +169,13 @@ def _from_transformers( } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) - model = patch_decoder_attention_mask(model) - model = ipex.optimize(model, dtype=torch_dtype, level="O1", auto_kernel_selection=True) - traced_model = jit_trace(model, task, use_cache) + traced_model = ipex_jit_trace(model, task, use_cache) save_dir = TemporaryDirectory() save_dir_path = Path(save_dir.name) torch.jit.save(traced_model, save_dir_path / WEIGHTS_NAME) config.torchscript = True + config.torch_dtype = torch_dtype return cls._from_pretrained( model_id=save_dir_path, @@ -135,6 +186,7 @@ def _from_transformers( cache_dir=cache_dir, local_files_only=local_files_only, use_cache=use_cache, + model_dtype=torch_dtype, ) @classmethod @@ -213,6 +265,13 @@ def device(self) -> torch.device: def dtype(self) -> torch.dtype: return self._dtype + @property + def model_dtype(self): + logger.warning( + "access to the `model_dtype` attribute is deprecated and will be removed after v1.18.0, please use `_dtype` instead." 
+ ) + return self._dtype + def to(self, device: Union[torch.device, str]): self._device = device if isinstance(device, torch.device) else torch.device(device) self.model.to(self._device) @@ -223,7 +282,7 @@ def can_generate(self): def _call_model(self, *args, **kwargs): try: - with torch.autocast(self.device.type, self.dtype): + with torch.autocast(self.device.type, self.dtype), torch.no_grad(): out = self.model(*args, **kwargs) except RuntimeError: out = self.model(*args, **kwargs) @@ -232,10 +291,12 @@ def _call_model(self, *args, **kwargs): def _init_warmup(self): # warmup, the first 2 forwards of an IPEX model include some preprocessing steps and # the results of the compute are unpredictable - use_cache = "past_key_values" in self.input_names - dummy_inputs = prepare_jit_inputs(self, self.export_feature, use_cache) - for _ in range(2): - self(**dummy_inputs) + # TODO : add warmup for IPEX exported model + if not self._is_ipex_exported: + use_cache = "past_key_values" in self.input_names + dummy_inputs = prepare_jit_inputs(self, self.export_feature, use_cache) + for _ in range(2): + self(**dummy_inputs) class IPEXModelForSequenceClassification(IPEXModel): @@ -334,10 +395,10 @@ def __init__( ): # Perform the initial warmup at the end of __init__ super().__init__(model, config, model_save_dir=model_save_dir, warmup=False) + GenerationMixin.__init__(self) model_type = config.model_type.replace("_", "-") self.normalized_config = NormalizedConfigManager.get_normalized_config_class(model_type)(config) - self.model_dtype = kwargs.get("model_dtype", self.dtype) self.use_cache = "past_key_values" in self.input_names if use_cache ^ self.use_cache: @@ -357,7 +418,15 @@ def __init__( ) except AttributeError: self.model_cls = get_model_class(self.config, AutoModelForCausalLM._model_mapping) - self._reorder_cache = self.model_cls._reorder_cache.__get__(self) + + if self._is_ipex_exported: + self._reorder_cache = _ipex_reorder_cache + else: + # Check if _reorder_cache is a static method + if isinstance(self.model_cls.__dict__["_reorder_cache"], staticmethod): + self._reorder_cache = self.model_cls._reorder_cache + else: + self._reorder_cache = self.model_cls._reorder_cache.__get__(self) if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon"}: self.prepare_inputs_for_generation = _prepare_inputs_for_generation_for_llama @@ -383,7 +452,25 @@ def _prepare_past_key_values(self, input_ids): else: num_attention_heads = self.normalized_config.num_attention_heads - if model_type == "bloom": + if self._is_ipex_exported: + # Indirect access kv cache has a different data layout compared with most transformers model, + # see https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache + beam_idx_tmp = torch.zeros( + (self.config.max_position_embeddings, input_ids.shape[0]), dtype=torch.long + ).contiguous() + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + ) + for i in range(num_layers) + ] + ) + return past_key_values + elif model_type == "bloom": shape_key = (batch_size * num_attention_heads, d_k, 0) shape_value = (batch_size * num_attention_heads, 0, d_k) key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device) @@ -505,3 +592,23 @@ def _prepare_inputs_for_generation_for_llama( } ) return model_inputs + + +def _ipex_reorder_cache( + past_key_values: 
Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor +) -> Tuple[Tuple[torch.Tensor]]: + # Ipex patched model uses indirect access kv cache which has a different shape with other transformers models + if len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1: + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + return past_key_values + elif len(past_key_values[0]) == 8: + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + layer_past[7][layer_past[0].size(-2) - 1] = beam_idx + return past_key_values + else: + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past_key_values + ) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 03b7d015d1..68119287d8 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -26,6 +26,7 @@ AutoModelForCausalLM, AutoModelForQuestionAnswering, AutoTokenizer, + GenerationConfig, PretrainedConfig, pipeline, set_seed, @@ -42,6 +43,8 @@ IPEXModelForSequenceClassification, IPEXModelForTokenClassification, ) +from optimum.intel.utils.import_utils import is_ipex_version +from optimum.utils.testing_utils import grid_parameters SEED = 42 @@ -216,6 +219,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "mpt", "opt", ) + IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama",) GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.0 @@ -259,6 +263,41 @@ def test_pipeline(self, model_arch): self.assertEqual(pipe.device, model.device) self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) + @parameterized.expand( + grid_parameters( + { + "model_arch": IPEX_PATCHED_SUPPORTED_ARCHITECTURES, + "use_cache": [True, False], + } + ) + ) + @unittest.skipIf(is_ipex_version("<", "2.5.0"), reason="Only ipex version > 2.3.0 supports ipex model patching") + def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache) + self.assertEqual(model.use_cache, use_cache) + trasnformers_model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + # Test with batch_size is 1 and 2. 
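+        # The first entry is a single prompt (batch size 1), the second a pair of prompts (batch size 2).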
+ texts = ["This is a sample", ["This is the first input", "This is the second input"]] + generation_configs = ( + GenerationConfig(max_new_tokens=4, num_beams=2, do_sample=True), + GenerationConfig(max_new_tokens=4, num_beams=4, do_sample=True), + GenerationConfig(max_new_tokens=4, num_beams=8, do_sample=True), + GenerationConfig(max_new_tokens=4, num_beams=32, do_sample=True), + GenerationConfig(max_new_tokens=4, do_sample=not use_cache, top_p=1.0, top_k=5, penalty_alpha=0.6), + GenerationConfig(max_new_tokens=4, do_sample=True, top_p=0.9, top_k=0), + ) + for text in texts: + tokens = tokenizer(text, padding=True, return_tensors="pt") + for generation_config in generation_configs: + outputs = model.generate(**tokens, generation_config=generation_config) + transformers_outputs = trasnformers_model.generate(**tokens, generation_config=generation_config) + self.assertIsInstance(outputs, torch.Tensor) + self.assertEqual(outputs, transformers_outputs) + def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) From be6666b8b15b7aec9ab8485b694d157040668516 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Fri, 8 Mar 2024 16:07:15 +0400 Subject: [PATCH 13/30] Updates weight quantization section in the docs (#593) --- docs/source/optimization_ov.mdx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 51067b0b64..088b78f0d3 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -82,7 +82,17 @@ from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig model = OVModelForCausalLM.from_pretrained( model_id, - export=True, + quantization_config=OVWeightQuantizationConfig(bits=4), +) +``` + +You can tune quantization parameters to achieve a better performance accuracy trade-off as follows: + +```python +from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig + +model = OVModelForCausalLM.from_pretrained( + model_id, quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb"), ) ``` From 72b0630d6bcac3b6047735e52ce114ed2bfda0f2 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:20:28 +0100 Subject: [PATCH 14/30] Remove accelerate and onnxruntime from required dependencies (#590) * Remove accelerate dependency * Add accelerate to import backend mapping * Add eval method to OVModels * add onnxruntime install for OV test * fix test expected int8 --- .github/workflows/test_openvino.yml | 2 +- optimum/intel/__init__.py | 37 +++++++++++++------ optimum/intel/openvino/__init__.py | 6 ++- optimum/intel/openvino/quantization.py | 14 +++++-- optimum/intel/utils/__init__.py | 1 + .../utils/dummy_openvino_and_nncf_objects.py | 6 +-- optimum/intel/utils/import_utils.py | 20 ++++++++++ setup.py | 9 +++-- tests/openvino/test_quantization.py | 20 +++++----- tests/openvino/test_stable_diffusion.py | 23 ++++-------- tests/openvino/test_training.py | 12 +++--- tests/openvino/utils_tests.py | 6 +-- 12 files changed, 96 insertions(+), 60 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index bf9460c75a..6d709eecfd 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip # install PyTorch CPU version to avoid installing 
CUDA packages on GitHub runner without GPU pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[openvino,openvino-tokenizers,nncf,tests,diffusers] + pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime - name: Test with Pytest run: | pytest tests/openvino/ --ignore test_modeling_basic diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 93a4417bfc..59059d688d 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -18,6 +18,7 @@ from transformers.utils import OptionalDependencyNotAvailable, _LazyModule from .utils import ( + is_accelerate_available, is_diffusers_available, is_ipex_available, is_neural_compressor_available, @@ -29,6 +30,7 @@ _import_structure = { "openvino": [], + "utils.dummy_openvino_and_nncf_objects": [], } try: @@ -57,13 +59,19 @@ if not (is_openvino_available() and is_nncf_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - _import_structure["utils.dummy_openvino_and_nncf_objects"] = [ - "OVQuantizer", - "OVTrainer", - "OVTrainingArguments", - ] + _import_structure["utils.dummy_openvino_and_nncf_objects"].extend(["OVQuantizer", "OVTrainingArguments"]) +else: + _import_structure["openvino"].extend(["OVQuantizer", "OVTrainingArguments"]) + + +try: + if not (is_openvino_available() and is_nncf_available() and is_accelerate_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + _import_structure["utils.dummy_openvino_and_nncf_objects"].extend(["OVTrainer"]) else: - _import_structure["openvino"].extend(["OVQuantizer", "OVTrainer", "OVTrainingArguments"]) + _import_structure["openvino"].extend(["OVTrainer"]) + try: if not (is_openvino_available() and is_diffusers_available()): @@ -145,6 +153,7 @@ "INCSeq2SeqTrainer", "INCTrainer", ] + try: if not (is_neural_compressor_available() and is_diffusers_available()): raise OptionalDependencyNotAvailable() @@ -177,13 +186,17 @@ if not (is_openvino_available() and is_nncf_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_openvino_and_nncf_objects import ( - OVQuantizer, - OVTrainer, - OVTrainingArguments, - ) + from .utils.dummy_openvino_and_nncf_objects import OVQuantizer, OVTrainingArguments + else: + from .openvino import OVQuantizer, OVTrainingArguments + + try: + if not (is_openvino_available() and is_nncf_available() and is_accelerate_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_openvino_and_nncf_objects import OVTrainer else: - from .openvino import OVQuantizer, OVTrainer, OVTrainingArguments + from .openvino import OVTrainer try: if not (is_openvino_available() and is_diffusers_available()): diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index a6227615a2..1df932771a 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -14,7 +14,7 @@ import logging -from ..utils.import_utils import is_diffusers_available, is_nncf_available +from ..utils.import_utils import is_accelerate_available, is_diffusers_available, is_nncf_available from .utils import ( OV_DECODER_NAME, OV_DECODER_WITH_PAST_NAME, @@ -37,9 +37,11 @@ patch_torch_operators() from .quantization import OVQuantizer - from .trainer import OVTrainer from .training_args import OVTrainingArguments + if is_accelerate_available(): + from .trainer import OVTrainer + from .configuration 
import OVConfig, OVWeightQuantizationConfig from .modeling import ( diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index d7b88f2be3..cd26f91f22 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -17,7 +17,7 @@ import logging import os from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union +from typing import Any, Callable, Dict, Optional, Tuple, Union import nncf import openvino @@ -56,8 +56,7 @@ if is_datasets_available(): - if TYPE_CHECKING: - from datasets import Dataset + from datasets import Dataset register_module(ignored_algorithms=[])(Conv1D) @@ -147,6 +146,7 @@ def __init__(self, model: transformers.PreTrainedModel, task: Optional[str] = No ) self.task = task or feature self.seed = seed + # TODO : deprecate input_names self.input_names = None signature = inspect.signature(self.model.forward) self._signature_columns = list(signature.parameters.keys()) @@ -526,9 +526,15 @@ def _get_calibration_dataloader( data_collator: Optional[DataCollator] = None, ) -> OVDataLoader: data_collator = data_collator if data_collator is not None else default_data_collator + + if not is_datasets_available() or not isinstance(calibration_dataset, Dataset): + logger.warning( + "`remove_unused_columns` set to `False` as calibration_dataset is not an instance of `datasets.Dataset`" + ) + remove_unused_columns = False + if remove_unused_columns: calibration_dataset = self._remove_unused_columns(calibration_dataset) - self.input_names = calibration_dataset.column_names generator = torch.Generator() generator.manual_seed(self.seed) sampler = RandomSampler(calibration_dataset, generator=generator) diff --git a/optimum/intel/utils/__init__.py b/optimum/intel/utils/__init__.py index 4e7522ee77..d77588f896 100644 --- a/optimum/intel/utils/__init__.py +++ b/optimum/intel/utils/__init__.py @@ -16,6 +16,7 @@ _neural_compressor_version, _torch_version, compare_versions, + is_accelerate_available, is_diffusers_available, is_ipex_available, is_neural_compressor_available, diff --git a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py index 45c390aff2..8ae3135667 100644 --- a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py @@ -27,14 +27,14 @@ def from_pretrained(cls, *args, **kwargs): class OVTrainer(metaclass=DummyObject): - _backends = ["openvino", "nncf"] + _backends = ["openvino", "nncf", "accelerate"] def __init__(self, *args, **kwargs): - requires_backends(self, ["openvino", "nncf"]) + requires_backends(self, ["openvino", "nncf", "accelerate"]) @classmethod def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["openvino", "nncf"]) + requires_backends(cls, ["openvino", "nncf", "accelerate"]) class OVQuantizer(metaclass=DummyObject): diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index f1fb486c4f..1d5ce25086 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -156,6 +156,16 @@ _datasets_available = False +_accelerate_available = importlib.util.find_spec("accelerate") is not None +_accelerate_version = "N/A" + +if _accelerate_available: + try: + _accelerate_version = importlib_metadata.version("accelerate") + except importlib_metadata.PackageNotFoundError: + _accelerate_available = False + + def is_transformers_available(): return 
_transformers_available @@ -196,6 +206,10 @@ def is_datasets_available(): return _datasets_available +def is_accelerate_available(): + return _accelerate_available + + # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): """ @@ -317,6 +331,11 @@ def is_timm_version(operation: str, version: str): `pip install datasets`. Please note that you may need to restart your runtime after installation. """ +ACCELERATE_IMPORT_ERROR = """ +{0} requires the accelerate library but it was not found in your environment. You can install it with pip: +`pip install accelerate`. Please note that you may need to restart your runtime after installation. +""" + BACKENDS_MAPPING = OrderedDict( [ ("diffusers", (is_diffusers_available, DIFFUSERS_IMPORT_ERROR)), @@ -324,6 +343,7 @@ def is_timm_version(operation: str, version: str): ("nncf", (is_nncf_available, NNCF_IMPORT_ERROR)), ("openvino", (is_openvino_available, OPENVINO_IMPORT_ERROR)), ("neural_compressor", (is_neural_compressor_available, NEURAL_COMPRESSOR_IMPORT_ERROR)), + ("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)), ] ) diff --git a/setup.py b/setup.py index dd98548018..ac4056c30d 100644 --- a/setup.py +++ b/setup.py @@ -18,10 +18,11 @@ "datasets>=1.4.0", "sentencepiece", "scipy", - "accelerate", # transformers 4.29 require accelerate for PyTorch + "onnx", ] TESTS_REQUIRE = [ + "accelerate", "pytest", "parameterized", "Pillow", @@ -39,11 +40,11 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>=2.2.0", "onnx", "onnxruntime<1.15.0"], - "openvino": ["openvino>=2023.3", "onnx", "onnxruntime", "nncf>=2.8.1"], + "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], + "openvino": ["openvino>=2023.3", "nncf>=2.8.1"], "openvino-tokenizers": ["openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.8.1"], - "ipex": ["intel-extension-for-pytorch", "onnx"], + "ipex": ["intel-extension-for-pytorch"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 0ef89ec8b8..a33e0339f3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -154,16 +154,16 @@ class OVWeightCompressionTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ( (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 70, 70), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 46), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 44), ) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 388),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 365),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 385),) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = ( - (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 16, 136), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 14, 136), ) 
SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ( - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 46), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 44), ) LOAD_IN_4_BITS_SCOPE = ( @@ -171,7 +171,7 @@ class OVWeightCompressionTest(unittest.TestCase): OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8), - 16, + 14, ), ( OVModelForCausalLM, @@ -182,13 +182,13 @@ class OVWeightCompressionTest(unittest.TestCase): group_size=32, ignored_scope={"names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]}, ), - 6, + 4, ), ( OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True), - 22, + 18, ), ( OVModelForCausalLM, @@ -201,7 +201,7 @@ class OVWeightCompressionTest(unittest.TestCase): sensitivity_metric="mean_activation_magnitude", dataset="ptb", ), - 16, + 14, ), ( OVModelForCausalLM, @@ -215,7 +215,7 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="ptb", awq=True, ), - 16, + 14, ), ) diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index d8cef2e027..ab6f6f21a6 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -28,7 +28,6 @@ from diffusers.utils import load_image from diffusers.utils.testing_utils import floats_tensor from openvino.runtime.ie_api import CompiledModel -from packaging.version import Version, parse from parameterized import parameterized from utils_tests import MODEL_NAMES, SEED @@ -46,13 +45,8 @@ OVModelVaeDecoder, OVModelVaeEncoder, ) -from optimum.onnxruntime import ( - ORTStableDiffusionImg2ImgPipeline, - ORTStableDiffusionInpaintPipeline, - ORTStableDiffusionXLImg2ImgPipeline, - ORTStableDiffusionXLPipeline, -) -from optimum.utils.import_utils import _diffusers_version +from optimum.intel.utils.import_utils import is_diffusers_version +from optimum.utils.import_utils import is_onnxruntime_available F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"} @@ -167,7 +161,6 @@ def generate_inputs(self, height=128, width=128, batch_size=1): class OVStableDiffusionImg2ImgPipelineTest(OVStableDiffusionPipelineBaseTest): SUPPORTED_ARCHITECTURES = ("stable-diffusion",) MODEL_CLASS = OVStableDiffusionImg2ImgPipeline - ORT_MODEL_CLASS = ORTStableDiffusionImg2ImgPipeline TASK = "image-to-image" @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -298,11 +291,13 @@ def test_height_width_properties(self, model_arch: str): class OVStableDiffusionInpaintPipelineTest(OVStableDiffusionPipelineBaseTest): SUPPORTED_ARCHITECTURES = ("stable-diffusion",) MODEL_CLASS = OVStableDiffusionInpaintPipeline - ORT_MODEL_CLASS = ORTStableDiffusionInpaintPipeline TASK = "inpaint" @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skipIf(not is_onnxruntime_available(), "this test requires onnxruntime") def test_compare_diffusers_pipeline(self, model_arch: str): + from optimum.onnxruntime import ORTStableDiffusionInpaintPipeline + model_id = MODEL_NAMES[model_arch] pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) batch_size, num_images, height, width = 1, 1, 64, 64 @@ -329,7 +324,7 @@ def test_compare_diffusers_pipeline(self, model_arch: str): outputs = pipeline(**inputs, latents=latents).images self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - ort_pipeline = self.ORT_MODEL_CLASS.from_pretrained(model_id, export=True) + ort_pipeline 
= ORTStableDiffusionInpaintPipeline.from_pretrained(model_id, export=True) ort_outputs = ort_pipeline(**inputs, latents=latents).images self.assertTrue(np.allclose(outputs, ort_outputs, atol=1e-1)) @@ -358,7 +353,6 @@ def generate_inputs(self, height=128, width=128, batch_size=1): class OVtableDiffusionXLPipelineTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ("stable-diffusion-xl",) MODEL_CLASS = OVStableDiffusionXLPipeline - ORT_MODEL_CLASS = ORTStableDiffusionXLPipeline PT_MODEL_CLASS = StableDiffusionXLPipeline TASK = "text-to-image" @@ -444,7 +438,6 @@ def test_num_images_per_prompt_static_model(self, model_arch: str): class OVStableDiffusionXLImg2ImgPipelineTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ("stable-diffusion-xl", "stable-diffusion-xl-refiner") MODEL_CLASS = OVStableDiffusionXLImg2ImgPipeline - ORT_MODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline PT_MODEL_CLASS = StableDiffusionXLImg2ImgPipeline TASK = "image-to-image" @@ -489,7 +482,7 @@ class OVLatentConsistencyModelPipelineTest(unittest.TestCase): TASK = "text-to-image" @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(parse(_diffusers_version) <= Version("0.21.4"), "not supported with this diffusers version") + @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version") def test_compare_to_diffusers(self, model_arch: str): ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG) self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) @@ -532,7 +525,7 @@ def test_compare_to_diffusers(self, model_arch: str): self.assertEqual(pipeline.device.type, ov_pipeline.device) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(parse(_diffusers_version) <= Version("0.21.4"), "not supported with this diffusers version") + @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version") def test_num_images_per_prompt_static_model(self, model_arch: str): model_id = MODEL_NAMES[model_arch] pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False) diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index 937c0bf3f5..80298faf2b 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -365,7 +365,7 @@ def tearDown(self): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=44, + expected_fake_quantize=34, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -376,7 +376,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=44, + expected_fake_quantize=34, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -385,7 +385,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=44, + expected_fake_quantize=34, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -397,7 +397,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, 
STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=44, + expected_fake_quantize=34, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -749,7 +749,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-Wav2Vec2Model", nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2], - expected_fake_quantize=48, + expected_fake_quantize=40, expected_int8=30, expected_binary_masks=48, compression_metrics=["compression_loss"], @@ -766,7 +766,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): model_id="hf-internal-testing/tiny-random-Wav2Vec2Model", teacher_model_id="hf-internal-testing/tiny-random-Wav2Vec2Model", nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2], - expected_fake_quantize=48, + expected_fake_quantize=40, expected_int8=30, expected_binary_masks=48, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 8fabb34e38..04049172d3 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -102,12 +102,12 @@ SEED = 42 _ARCHITECTURES_TO_EXPECTED_INT8 = { - "bert": (70,), + "bert": (68,), "roberta": (68,), "albert": (84,), "vit": (64,), "blenderbot": (70,), - "gpt2": (46,), + "gpt2": (44,), "wav2vec2": (34,), "distilbert": (66,), "t5": (64, 104, 84), @@ -116,7 +116,7 @@ "stable-diffusion-xl-refiner": (366, 34, 42, 66), } -_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (64, 477)} +_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 477)} def get_num_quantized_nodes(ov_model): From e6ee88cf21d4ba582e25ce04e1dac0c9789774e1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 11 Mar 2024 11:04:56 +0100 Subject: [PATCH 15/30] Fix OpenVINO image classification examples (#598) --- .../run_image_classification.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/openvino/image-classification/run_image_classification.py b/examples/openvino/image-classification/run_image_classification.py index 8a7c009e46..5f98d95cb5 100644 --- a/examples/openvino/image-classification/run_image_classification.py +++ b/examples/openvino/image-classification/run_image_classification.py @@ -151,12 +151,12 @@ class ModelArguments: metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) - use_auth_token: bool = field( - default=False, + token: str = field( + default=None, metadata={ "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." 
) }, ) @@ -239,8 +239,7 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, - task="image-classification", - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) else: data_files = {} @@ -252,7 +251,6 @@ def main(): "imagefolder", data_files=data_files, cache_dir=model_args.cache_dir, - task="image-classification", ) # If we don't have a validation split, split off a percentage of train as validation. @@ -287,7 +285,7 @@ def compute_metrics(p): finetuning_task="image-classification", cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) model = AutoModelForImageClassification.from_pretrained( model_args.model_name_or_path, @@ -295,7 +293,7 @@ def compute_metrics(p): config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) @@ -311,7 +309,7 @@ def compute_metrics(p): model_args.feature_extractor_name or model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, + token=model_args.token, ) # Define torchvision transforms to be applied to each image. From 5c683a3cb53c195d485d5602f447af43107d6a9d Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 11 Mar 2024 19:24:47 +0400 Subject: [PATCH 16/30] Fix weights compression for OPenVINO models (#596) * hot fix for weights compression * rewrite mcok tests --- optimum/intel/openvino/modeling_decoder.py | 8 +-- tests/openvino/test_quantization.py | 64 ++++++++++++++++------ 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 92a2ce436d..3d9671caf1 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -261,10 +261,10 @@ def _from_transformers( task = task + "-with-past" # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size - if load_in_8bit is None or not quantization_config: - ov_config = None + if load_in_8bit is None and not quantization_config: + ov_export_config = None else: - ov_config = OVConfig(dtype="fp32") + ov_export_config = OVConfig(dtype="fp32") stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache) @@ -279,7 +279,7 @@ def _from_transformers( local_files_only=local_files_only, force_download=force_download, trust_remote_code=trust_remote_code, - ov_config=ov_config, + ov_config=ov_export_config, stateful=stateful, ) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index a33e0339f3..57c45df6ec 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -459,36 +459,64 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): self.assertEqual(0, num_int8) def test_ovmodel_load_large_model_with_default_compressed_weights(self): - with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch: - model_mixin_patch.num_parameters.return_value = 2e9 + with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: + mock_tensor = unittest.mock.Mock() + 
mock_tensor.numel = lambda: 2000000000 + mock_tensor.requires_grad = True + model_parameters.return_value = [mock_tensor] with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: _ = OVModelForCausalLM.from_pretrained( MODEL_NAMES["llama"], export=True, compile=False, use_cache=False ) - saving_params = { - "model": unittest.mock.ANY, - "path": unittest.mock.ANY, - "compression_option": "int8", - "compression_ratio": None, - } - save_model_patch.aasert_called_with(saving_params) + save_model_patch.assert_called_with( + unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(quantization_config={"bits": 8}) + ) def test_ovmodel_load_large_model_with_uncompressed_weights(self): - with unittest.mock.patch("transformers.modeling_utils.ModuleUtilsMixin") as model_mixin_patch: - model_mixin_patch.num_parameters.return_value = 2e9 + with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: + mock_tensor = unittest.mock.Mock() + mock_tensor.numel = lambda: 2000000000 + mock_tensor.requires_grad = True + model_parameters.return_value = [mock_tensor] with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: _ = OVModelForCausalLM.from_pretrained( MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False ) - saving_params = { - "model": unittest.mock.ANY, - "path": unittest.mock.ANY, - "compression_option": "fp32", - "compression_ratio": None, - } - save_model_patch.aasert_called_with(saving_params) + save_model_patch.assert_called_with( + unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(dtype="fp32") + ) + + def test_ovmodel_load_large_model_with_additional_quantization_config(self): + with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: + mock_tensor = unittest.mock.Mock() + mock_tensor.numel = lambda: 2000000000 + mock_tensor.requires_grad = True + with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: + with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: + with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], + export=True, + compile=False, + use_cache=False, + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + ) + # quantization will be performed later, using load_model + save_model_patch.assert_called_with( + unittest.mock.ANY, unittest.mock.ANY, ov_config=OVConfig(dtype="fp32") + ) + compression_params = { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "ratio": 0.8, + "group_size": -1, + "all_layers": None, + "sensitivity_metric": None, + "dataset": None, + "ignored_scope": None, + } + compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) class OVQuantizerQATest(unittest.TestCase): From 36459a1f0dac75e5b70b792c65dba97fc5b9bb6b Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 11 Mar 2024 17:37:19 +0100 Subject: [PATCH 17/30] Fix default ov config (#600) --- optimum/intel/openvino/modeling.py | 4 ++-- optimum/intel/openvino/modeling_base.py | 4 ++-- optimum/intel/openvino/modeling_base_seq2seq.py | 4 ++-- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 
4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 7831305d5f..357ca94c07 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -434,8 +434,8 @@ def _from_transformers( save_dir = TemporaryDirectory() save_dir_path = Path(save_dir.name) - # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size - if load_in_8bit is None or not quantization_config: + # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size + if load_in_8bit is None and not quantization_config: ov_config = None else: ov_config = OVConfig(dtype="fp32") diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index af00f7a06e..7ab99aab42 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -314,8 +314,8 @@ def _from_transformers( save_dir = TemporaryDirectory() save_dir_path = Path(save_dir.name) - # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size - if load_in_8bit is None or not quantization_config: + # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size + if load_in_8bit is None and not quantization_config: ov_config = None else: ov_config = OVConfig(dtype="fp32") diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 3cb43e61b8..28e112c4d9 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -258,8 +258,8 @@ def _from_transformers( if use_cache: task = task + "-with-past" - # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size - if load_in_8bit is None or not quantization_config: + # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size + if load_in_8bit is None and not quantization_config: ov_config = None else: ov_config = OVConfig(dtype="fp32") diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 3d9671caf1..edc88d02cb 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -260,7 +260,7 @@ def _from_transformers( if use_cache: task = task + "-with-past" - # If load_in_8bit or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size + # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size if load_in_8bit is None and not quantization_config: ov_export_config = None else: diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 5e8a0cdc59..a985f43d7c 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -321,8 +321,8 @@ def _from_transformers( save_dir = TemporaryDirectory() save_dir_path = Path(save_dir.name) - # If load_in_8bit 
or quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size - if load_in_8bit is None or not quantization_config: + # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size + if load_in_8bit is None and not quantization_config: ov_config = None else: ov_config = OVConfig(dtype="fp32") From d2d2d5dd4d384f3fe3c4b1a6455d47273e2e3384 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Tue, 12 Mar 2024 11:20:13 +0100 Subject: [PATCH 18/30] Add warning for transformers>=4.38 and OpenVINO 2024.0 (#599) * Add warning for transformers>=4.38 and OpenVINO 2024.0 * Use is_openvino_version to compare versions * Show version warning only for llama and gpt-bigcode * Fix style, show OpenVINO version * Include affected model types in warning message --- optimum/exporters/openvino/model_patcher.py | 23 +++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index aea57161e2..91dc48df05 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -15,21 +15,24 @@ import logging as log from optimum.intel.utils.import_utils import ( + _openvino_version, _torch_version, _transformers_version, + is_openvino_version, is_torch_version, is_transformers_version, ) def patch_model_with_bettertransformer(model): + COLOR_RED = "\033[1;31m" + COLOR_RESET = "\033[0m" + # check that the model has not yet been pathced if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: return model if is_transformers_version("<", "4.36") or is_torch_version("<", "2.1.1"): - COLOR_RED = "\033[1;31m" - COLOR_RESET = "\033[0m" log.warn( COLOR_RED + "[WARNING] For good performance with stateful models, transformers>=4.36.2 and PyTorch>=2.1.1 are required. " @@ -39,6 +42,22 @@ def patch_model_with_bettertransformer(model): + COLOR_RESET ) + if ( + getattr(model.config, "model_type") in {"gpt_bigcode", "llama"} + and is_transformers_version(">=", "4.38") + and is_openvino_version("<", "2024.1.0-14612") + ): + # display commit-id only when a nightly/prerelease of OpenVINO is installed. + display_version = ( + _openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version + ) + log.warn( + COLOR_RED + f"[WARNING] Stateful models are not supported for Llama and GPTBigCode with Transformers " + f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: " + "https://docs.openvino.ai/2024/get-started/install-openvino.html. 
For models that do not need transformers " + "4.38+, it is also an option to downgrade transformers: `pip install transformers==4.37.2`" + COLOR_RESET + ) + # model already has required SDPA implementation if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa": return model From 6faf44562d762ca66bb1db93c9c2703e37cf5020 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Tue, 12 Mar 2024 10:40:33 +0000 Subject: [PATCH 19/30] Add hybrid quantization for StableDiffusion pipelines (#584) * Add hybrid quantization for StableDiffusion pipelines * apply black * fix tests * fix ruff * fix lcm bug * apply review comments * rework dataset processing * Add doc * remove SDXL test * Apply comments * reformat --- docs/source/optimization_ov.mdx | 17 ++++ optimum/intel/openvino/configuration.py | 37 ++++--- optimum/intel/openvino/modeling_decoder.py | 3 +- optimum/intel/openvino/modeling_diffusion.py | 101 ++++++++++++++++++- optimum/intel/openvino/quantization.py | 95 ++++++++++++++++- optimum/intel/openvino/utils.py | 7 ++ tests/openvino/test_quantization.py | 45 ++++++++- tests/openvino/utils_tests.py | 6 +- 8 files changed, 283 insertions(+), 28 deletions(-) diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 088b78f0d3..70c98f14f7 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -69,6 +69,23 @@ from optimum.intel import OVModelForCausalLM model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) ``` +## Hybrid quantization + +Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights. +The UNet model takes up most of the overall execution time of the pipeline. Thus, optimizing just one model brings substantial benefits in terms of inference speed while keeping acceptable accuracy without fine-tuning. Quantizing the rest of the diffusion pipeline does not significantly improve inference performance but could potentially lead to substantial degradation of accuracy. +Therefore, the proposal is to apply quantization in *hybrid mode* for the UNet model and weight-only quantization for the rest of the pipeline components. The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and activations of other layers, facilitating accuracy preservation post-optimization while reducing the model size. +The `quantization_config` is utilized to define optimization parameters for optimizing the Stable Diffusion pipeline. To enable hybrid quantization, specify the quantization dataset in the `quantization_config`. Otherwise, weight-only quantization to a specified data type (8 tr 4 bits) is applied to UNet model. + +```python +from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig + +model = OVStableDiffusionPipeline.from_pretrained( + model_id, + export=True, + quantization_config=OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions"), +) +``` + `load_in_8bit` is enabled by default for the models larger than 1 billion parameters. 
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 8ddd005279..40a60bb58e 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -167,7 +167,7 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin): bits (`int`, defaults to 8): The number of bits to quantize to. - sym (`bool`, *optional*, defaults to `False`): + sym (`bool`, defaults to `False`): Whether to use symetric quantization. tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): The tokenizer used to process the dataset. You can pass either: @@ -177,23 +177,24 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin): user or organization name, like `dbmdz/bert-base-german-cased`. - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. - dataset (`Union[List[str]]`, *optional*): - The dataset used for data-aware compression. You can provide your own dataset in a list of string or just use the - the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] - group_size (`int`, *optional*, defaults to 128): - The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. - ratio (`float`, *optional*, defaults to 1.0): + dataset (`str or List[str]`, *optional*): + The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset + in a list of strings or just use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs + or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. + ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). + group_size (`int`, *optional*): + The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. all_layers (`bool`, *optional*): Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion. - sensitivity_metric (`nncf.SensitivityMetric`, *optional*): + sensitivity_metric (`str`, *optional*): The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model, the more sensitive layers receives a higher precision. - awq (`bool`, *optional*): - Enables AWQ method to unify weight ranges and improve overall model accuracy. - ignored_scope (`nncf.IgnoredScope`, *optional*): + ignored_scope (`dict`, *optional*): An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. 
""" @@ -202,12 +203,13 @@ def __init__( bits: int = 8, sym: bool = False, tokenizer: Optional[Any] = None, - dataset: Optional[str] = None, + dataset: Optional[Union[str, List[str]]] = None, ratio: float = 1.0, group_size: Optional[int] = None, all_layers: Optional[bool] = None, sensitivity_metric: Optional[str] = None, ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = None, **kwargs, ): self.bits = bits @@ -219,6 +221,7 @@ def __init__( self.all_layers = all_layers self.sensitivity_metric = sensitivity_metric self.ignored_scope = ignored_scope + self.num_samples = num_samples self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release self.post_init() @@ -231,10 +234,16 @@ def post_init(self): if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") if self.dataset is not None and isinstance(self.dataset, str): - if self.dataset not in ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]: + llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"] + stable_diffusion_datasets = [ + "conceptual_captions", + "laion/220k-GPT4Vision-captions-from-LIVIS", + "laion/filtered-wit", + ] + if self.dataset not in llm_datasets + stable_diffusion_datasets: raise ValueError( f"""You have entered a string value for dataset. You can only choose between - ['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}""" + {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" ) if self.bits not in [4, 8]: diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index edc88d02cb..53aa05bc5a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -635,7 +635,8 @@ def _from_pretrained( # from optimum.gptq.utils import get_seqlen # seqlen = get_seqlen(causal_model) - dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32) + nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 + dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) quantization_config = copy.deepcopy(quantization_config) quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index a985f43d7c..c0588a6f11 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -16,6 +16,7 @@ import logging import os import shutil +from copy import deepcopy from pathlib import Path from tempfile import TemporaryDirectory, gettempdir from typing import Any, Dict, List, Optional, Union @@ -57,7 +58,13 @@ from .configuration import OVConfig, OVWeightQuantizationConfig from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel -from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, _print_compiled_model_properties +from .utils import ( + ONNX_WEIGHTS_NAME, + OV_TO_NP_TYPE, + OV_XML_FILE_NAME, + PREDEFINED_SD_DATASETS, + _print_compiled_model_properties, +) core = Core() @@ -274,9 +281,19 @@ def _from_pretrained( kwargs[name] = load_method(new_model_save_dir) quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) - unet = cls.load_model( - new_model_save_dir / 
DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, quantization_config - ) + + unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name + if quantization_config is not None and quantization_config.dataset is not None: + # load the UNet model uncompressed to apply hybrid quantization further + unet = cls.load_model(unet_path) + # Apply weights compression to other `components` without dataset + weight_quantization_params = { + param: value for param, value in quantization_config.__dict__.items() if param != "dataset" + } + weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_params) + else: + weight_quantization_config = quantization_config + unet = cls.load_model(unet_path, weight_quantization_config) components = { "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, @@ -286,11 +303,29 @@ def _from_pretrained( } for key, value in components.items(): - components[key] = cls.load_model(value, quantization_config) if value.is_file() else None + components[key] = cls.load_model(value, weight_quantization_config) if value.is_file() else None if model_save_dir is None: model_save_dir = new_model_save_dir + if quantization_config is not None and quantization_config.dataset is not None: + sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + + supported_pipelines = ( + OVStableDiffusionPipeline, + OVStableDiffusionXLPipeline, + OVLatentConsistencyModelPipeline, + ) + if not isinstance(sd_model, supported_pipelines): + raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}") + + nsamples = quantization_config.num_samples if quantization_config.num_samples else 200 + unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples) + + from .quantization import _hybrid_quantization + + unet = _hybrid_quantization(sd_model.unet.model, weight_quantization_config, dataset=unet_inputs) + return cls( unet=unet, config=config, @@ -300,6 +335,62 @@ def _from_pretrained( **kwargs, ) + def _prepare_unet_inputs( + self, + dataset: Union[str, List[Any]], + num_samples: int, + height: Optional[int] = None, + width: Optional[int] = None, + seed: Optional[int] = 42, + **kwargs, + ) -> Dict[str, Any]: + self.compile() + + size = self.unet.config.get("sample_size", 64) * self.vae_scale_factor + height = height or min(size, 512) + width = width or min(size, 512) + + if isinstance(dataset, str): + dataset = deepcopy(dataset) + available_datasets = PREDEFINED_SD_DATASETS.keys() + if dataset not in available_datasets: + raise ValueError( + f"""You have entered a string value for dataset. 
You can only choose between + {list(available_datasets)}, but the {dataset} was found""" + ) + + from datasets import load_dataset + + dataset_metadata = PREDEFINED_SD_DATASETS[dataset] + dataset = load_dataset(dataset, split=dataset_metadata["split"], streaming=True).shuffle(seed=seed) + input_names = dataset_metadata["inputs"] + dataset = dataset.select_columns(list(input_names.values())) + + def transform_fn(data_item): + return {inp_name: data_item[column] for inp_name, column in input_names.items()} + + else: + + def transform_fn(data_item): + return data_item if isinstance(data_item, (list, dict)) else [data_item] + + from .quantization import InferRequestWrapper + + calibration_data = [] + self.unet.request = InferRequestWrapper(self.unet.request, calibration_data) + + for inputs in dataset: + inputs = transform_fn(inputs) + if isinstance(inputs, dict): + self.__call__(**inputs, height=height, width=width) + else: + self.__call__(*inputs, height=height, width=width) + if len(calibration_data) > num_samples: + break + + self.unet.request = self.unet.request.request + return calibration_data[:num_samples] + @classmethod def _from_transformers( cls, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index cd26f91f22..c46f29092b 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -16,6 +16,7 @@ import inspect import logging import os +from collections import deque from pathlib import Path from typing import Any, Callable, Dict, Optional, Tuple, Union @@ -24,6 +25,7 @@ import torch import transformers from nncf import CompressWeightsMode, IgnoredScope, NNCFConfig, SensitivityMetric +from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters from nncf.torch import create_compressed_model, register_default_init_args, register_module from nncf.torch.dynamic_graph.io_handling import wrap_nncf_model_inputs_with_objwalk from nncf.torch.initialization import PTInitializingDataLoader @@ -550,7 +552,7 @@ def _remove_unused_columns(self, dataset: "Dataset"): def _weight_only_quantization( model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict] -): +) -> openvino.runtime.Model: config = quantization_config if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) @@ -564,7 +566,8 @@ def _weight_only_quantization( from optimum.gptq.data import get_dataset, prepare_dataset - dataset = get_dataset(config.dataset, tokenizer, seqlen=32) + nsamples = config.num_samples if config.num_samples else 128 + dataset = get_dataset(config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) sensitivity_metric = None @@ -590,4 +593,92 @@ def _weight_only_quantization( # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0 ignored_scope=ignored_scope, dataset=dataset, + # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 ) + + +def _get_operation_const_op(operation, const_port_id: int): + node = operation.input_value(const_port_id).get_node() + queue = deque([node]) + constant_node = None + allowed_propagation_types_list = ["Convert", "FakeQuantize", "Reshape"] + + while len(queue) != 0: + curr_node = queue.popleft() + if curr_node.get_type_name() == "Constant": + constant_node = curr_node + break + if len(curr_node.inputs()) == 0: + break + if curr_node.get_type_name() in allowed_propagation_types_list: + 
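+            # Convert/FakeQuantize/Reshape act as pass-through ops here: keep walking up toward the Constant feeding this input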
queue.append(curr_node.input_value(0).get_node()) + + return constant_node + + +def _is_embedding(node) -> bool: + allowed_types_list = ["f16", "f32", "f64"] + const_port_id = 0 + input_tensor = node.input_value(const_port_id) + if input_tensor.get_element_type().get_type_name() in allowed_types_list: + const_node = _get_operation_const_op(node, const_port_id) + if const_node is not None: + return True + + return False + + +def _collect_ops_with_weights(model): + ops_with_weights = [] + for op in model.get_ops(): + if op.get_type_name() == "MatMul": + constant_node_0 = _get_operation_const_op(op, const_port_id=0) + constant_node_1 = _get_operation_const_op(op, const_port_id=1) + if constant_node_0 or constant_node_1: + ops_with_weights.append(op.get_friendly_name()) + if op.get_type_name() == "Gather" and _is_embedding(op): + ops_with_weights.append(op.get_friendly_name()) + + return ops_with_weights + + +def _hybrid_quantization( + model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig, dataset: Dict[str, Any] +) -> openvino.runtime.Model: + """ + Quantize a model in hybrid mode with NNCF which means that we quantize: + weights of MatMul and Embedding layers and activations of other layers. + The optimization specifications defined in `quantization_config`. + + Args: + model (`openvino.runtime.Model`): + The OpenVINO Runtime model for applying hybrid quantization. + quantization_config (`OVWeightQuantizationConfig`): + The configuration containing the parameters related to quantization. + dataset (`Dict[str, Any]`): + The dataset used for hybrid quantization. + Returns: + The OpenVINO Runtime model with applied hybrid quantization. + """ + ops_to_compress = _collect_ops_with_weights(model) + + ignored_scope = quantization_config.ignored_scope if isinstance(quantization_config.ignored_scope, dict) else {} + ptq_ignored_scope = nncf.IgnoredScope(**ignored_scope) + ptq_ignored_scope.names += ops_to_compress + + wc_quantization_config = copy.deepcopy(quantization_config) + wc_quantization_config.ignored_scope = ignored_scope + wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"] + compressed_model = _weight_only_quantization(model, wc_quantization_config) + + subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 + quantized_model = nncf.quantize( + model=compressed_model, + calibration_dataset=nncf.Dataset(dataset), + model_type=nncf.ModelType.TRANSFORMER, + ignored_scope=ptq_ignored_scope, + # The SQ algo should be disabled for MatMul nodes because their weights are already compressed + advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)), + subset_size=subset_size, + ) + return quantized_model diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 49aec81e57..cbcc696393 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -99,6 +99,13 @@ } +PREDEFINED_SD_DATASETS = { + "conceptual_captions": {"split": "train", "inputs": {"prompt": "caption"}}, + "laion/220k-GPT4Vision-captions-from-LIVIS": {"split": "train", "inputs": {"prompt": "caption"}}, + "laion/filtered-wit": {"split": "train", "inputs": {"prompt": "caption"}}, +} + + def use_external_data_format(num_parameters: int) -> bool: """ Returns whether or not the model requires using external data format for the ONNX export diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 
57c45df6ec..c7fb00e12d 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -39,6 +39,7 @@ from optimum.intel import ( OVConfig, + OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, OVModelForFeatureExtraction, @@ -157,10 +158,10 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 44), ) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 365),) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 385),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 86),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 148),) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = ( - (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 14, 136), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 14, 50), ) SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ( (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 44), @@ -233,6 +234,12 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionXLPipeline, "stable-diffusion-xl"), ) + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = ( + (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), + (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), + (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135), + ) + IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") DEFAULT_INT4_CONFIG = {"bits": 4, "sym": True, "group_size": 64, "all_layers": True} @@ -352,6 +359,38 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8[i], num_int8) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) + def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8): + model_id = MODEL_NAMES[model_type] + quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2) + with tempfile.TemporaryDirectory() as tmp_dir: + model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) + + num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) + self.assertEqual(expected_num_fake_quantize, num_fake_quantize) + self.assertEqual(expected_ov_int8, num_int8) + self.assertEqual(0, num_int4) + + model.save_pretrained(tmp_dir) + + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:]) + def test_ovmodel_hybrid_quantization_with_custom_dataset( + self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8 + ): + model_id = MODEL_NAMES[model_type] + dataset = [ + "dream rose covered with clean crystal, sharp edges, transparent, beautiful, highly detailed, high render" + ] + model = model_cls.from_pretrained( + model_id, + export=True, + quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3), + ) + num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) + self.assertEqual(expected_num_fake_quantize, num_fake_quantize) + self.assertEqual(expected_ov_int8, num_int8) + self.assertEqual(0, num_int4) + 
@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS) @unittest.mock.patch.dict( "optimum.intel.openvino.configuration._DEFAULT_4BIT_CONFIGS", {"facebook/opt-125m": DEFAULT_INT4_CONFIG} diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 04049172d3..97c8a92836 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -116,7 +116,7 @@ "stable-diffusion-xl-refiner": (366, 34, 42, 66), } -_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 477)} +_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 86)} def get_num_quantized_nodes(ov_model): @@ -127,8 +127,8 @@ def get_num_quantized_nodes(ov_model): if "FakeQuantize" in elem.name: num_fake_quantize += 1 for i in range(elem.get_output_size()): - if "8" in elem.get_output_element_type(i).get_type_name(): + if elem.get_output_element_type(i).get_type_name() in ["i8", "u8"]: num_int8 += 1 - if "4" in elem.get_output_element_type(i).get_type_name(): + if elem.get_output_element_type(i).get_type_name() in ["i4", "u4"]: num_int4 += 1 return num_fake_quantize, num_int8, num_int4 From 358f389cc0d5ba1206e77d900ed9f02b1887ee42 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Tue, 12 Mar 2024 18:40:00 +0100 Subject: [PATCH 20/30] Show device name in _print_compiled_model_properties (#541) * Show device name in _print_compiled_model_properties Enable CACHE_DIR also for devices like "GPU:0" * Update optimum/intel/openvino/modeling_seq2seq.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Change check for gpu device --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/modeling_seq2seq.py | 4 ++-- optimum/intel/openvino/utils.py | 8 +++++++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 7ab99aab42..15f1fc4f1c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -388,7 +388,7 @@ def compile(self): if ( "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()) - and self._device.lower() == "gpu" + and "gpu" in self._device.lower() ): # Set default CACHE_DIR only if it is not set, if the model is not in a temporary directory, and device is GPU cache_dir = Path(self.model_save_dir).joinpath("model_cache") diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c0588a6f11..f0fea5a8ce 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -669,7 +669,7 @@ def _compile(self): if ( "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()) - and self.device.lower() == "gpu" + and self.device.lower().split(":")[0] == "gpu" ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 617d898be5..d68cbc75ed 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -451,7 +451,7 @@ def _compile(self): if ( "CACHE_DIR" not in ov_config.keys() and not str(self.parent_model.model_save_dir).startswith(gettempdir()) - and self._device.lower() == "gpu" + and "gpu" 
in self._device.lower() ): cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) @@ -563,7 +563,7 @@ def _compile(self): if ( "CACHE_DIR" not in ov_config.keys() and not str(self.parent_model.model_save_dir).startswith(gettempdir()) - and self._device.lower() == "gpu" + and "gpu" in self._device.lower() ): cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index cbcc696393..a0439d2129 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -20,7 +20,7 @@ import numpy as np from huggingface_hub import model_info -from openvino.runtime import Type, properties +from openvino.runtime import Core, Type, properties from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size @@ -155,3 +155,9 @@ def _print_compiled_model_properties(compiled_model): logger.info(f" {k}: {value}") except Exception: logger.error(f"[error] Get property of '{k}' failed") + try: + logger.info("EXECUTION_DEVICES:") + for device in compiled_model.get_property("EXECUTION_DEVICES"): + logger.info(f" {device}: {Core().get_property(device, 'FULL_DEVICE_NAME')}") + except Exception: + logger.error("[error] Get FULL_DEVICE_NAME failed") From 6b4e24bac99e221f94ec0def3f510fb779a5abc6 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 13 Mar 2024 12:43:20 +0100 Subject: [PATCH 21/30] Improve documentation (#601) * Improve documentation * small fix * remove * Update docs/source/optimization_ov.mdx Co-authored-by: Alexander Kozlov * Update docs/source/inference.mdx Co-authored-by: Helena Kloosterman * fix typo * update paragraph --------- Co-authored-by: Alexander Kozlov Co-authored-by: Helena Kloosterman --- docs/source/inference.mdx | 11 +-- docs/source/optimization_ov.mdx | 141 +++++++++++++++++++------------- 2 files changed, 92 insertions(+), 60 deletions(-) diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index 65480c1d2f..e0b60baa2e 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -99,21 +99,22 @@ tokenizer.save_pretrained(save_directory) ### Weight-only quantization -You can also apply 8-bit or 4-bit weight quantization when exporting your model with the CLI by setting the `weight-format` argument to respectively `int8` or `int4`: +You can also apply fp16, 8-bit or 4-bit weight compression on the Linear, Convolutional and Embedding layers when exporting your model with the CLI by setting `--weight-format` to respectively `fp16`, `int8` or `int4`: ```bash optimum-cli export openvino --model gpt2 --weight-format int8 ov_model ``` -This will result in the exported model linear and embedding layers to be quantized to INT8 or INT4, the activations will be kept in floating point precision. This type of optimization allows reducing the footprint and latency of LLMs. +This type of optimization allows to reduce the memory footprint and inference latency. -By default the quantization scheme will be [assymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`. 
+ +By default the quantization scheme will be [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`. For INT4 quantization you can also specify the following arguments : * The `--group-size` parameter will define the group size to use for quantization, `-1` it will results in per-column quantization. * The `--ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`. -Smaller `group_size` and `ratio` of usually improve accuracy at the sacrifice of the model size and inference latency. +Smaller `group_size` and `ratio` values usually improve accuracy at the sacrifice of the model size and inference latency. You can also apply 8-bit quantization on your model's weight when loading your model by setting the `load_in_8bit=True` argument when calling the `from_pretrained()` method. @@ -125,7 +126,7 @@ model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) -`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. +`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. You can disable it with `load_in_8bit=False`. diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 70c98f14f7..1e78c36805 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -19,15 +19,72 @@ limitations under the License. 🤗 Optimum Intel provides an `openvino` package that enables you to apply a variety of model compression methods such as quantization, pruning, on many models hosted on the 🤗 hub using the [NNCF](https://docs.openvino.ai/2022.1/docs_nncf_introduction.html) framework. -## Post-training optimization +## Post-training -Post-training static quantization introduces an additional calibration step where data is fed through the network in order to compute the activations quantization parameters. -Here is how to apply static quantization on a fine-tuned DistilBERT: +Quantization is a technique to reduce the computational and memory costs of running inference by representing the weights and / or the activations with lower precision data types like 8-bit or 4-bit. + +### Weight-only quantization + +Quantization can be applied on the model's Linear, Convolutional and Embedding layers, enabling the loading of large models on memory-limited devices. For example, when applying 8-bit quantization, the resulting model will be x4 smaller than its fp32 counterpart. For 4-bit quantization, the reduction in memory could theoretically reach x8, but is closer to x6 in practice. + + +#### 8-bit + +For the 8-bit weight quantization you can set `load_in_8bit=True` to load your model's weights in 8-bit: ```python -from functools import partial -from transformers import AutoTokenizer -from optimum.intel import OVConfig, OVQuantizer, OVModelForSequenceClassification, +from optimum.intel import OVModelForCausalLM + +model_id = "helenai/gpt2-ov" +model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) + +# Saves the int8 model that will be x4 smaller than its fp32 counterpart +model.save_pretrained(saving_directory) +``` + + + +`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. 
You can disable it with `load_in_8bit=False`. + + + +You can also provide a `quantization_config` instead to specify additional optimization parameters. + +#### 4-bit + +For the 4-bit weight quantization, you need a `quantization_config` to define the optimization parameters, for example: + +```python +from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig + +quantization_config = OVWeightQuantizationConfig(bits=4) +model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) +``` + +You can tune quantization parameters to achieve a better performance accuracy trade-off as follows: + +```python +quantization_config = OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb") +``` + +By default the quantization scheme will be [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `sym=True`. + +For 4-bit quantization you can also specify the following arguments in the quantization configuration : +* The `group_size` parameter will define the group size to use for quantization, `-1` it will results in per-column quantization. +* The `ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`. + +Smaller `group_size` and `ratio` values usually improve accuracy at the sacrifice of the model size and inference latency. + +### Static quantization + +When applying post-training static quantization, both the weights and the activations are quantized. +To apply quantization on the activations, an additional calibration step is needed which consists in feeding a `calibration_dataset` to the network in order to estimate the quantization activations parameters. 
+ +Here is how to apply static quantization on a fine-tuned DistilBERT given your own `calibration_dataset`: + +```python +from transformers import AutoTokenizer +from optimum.intel import OVQuantizer, OVModelForSequenceClassification, model_id = "distilbert-base-uncased-finetuned-sst-2-english" model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) @@ -35,11 +92,22 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) # The directory where the quantized model will be saved save_dir = "ptq_model" +quantizer = OVQuantizer.from_pretrained(model) + +# Apply static quantization and export the resulting quantized model to OpenVINO IR format +quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +# Save the tokenizer +tokenizer.save_pretrained(save_dir) +``` + +The calibration dataset can also be created easily using your `OVQuantizer`: + +```python +from functools import partial + def preprocess_function(examples, tokenizer): return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True) -# Instantiate our OVQuantizer using the desired configuration -quantizer = OVQuantizer.from_pretrained(model) # Create the calibration dataset used to perform static quantization calibration_dataset = quantizer.get_calibration_dataset( "glue", @@ -48,33 +116,23 @@ calibration_dataset = quantizer.get_calibration_dataset( num_samples=300, dataset_split="train", ) -# Apply static quantization and export the resulting quantized model to OpenVINO IR format -quantizer.quantize( - calibration_dataset=calibration_dataset, - save_directory=save_dir, -) -# Save the tokenizer -tokenizer.save_pretrained(save_dir) ``` -The `quantize()` method applies post-training static quantization and export the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device. -## Weight-only quantization +The `quantize()` method applies post-training static quantization and export the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device. -You can optimize the performance of text-generation LLMs by quantizing weights to various precisions that provide different performance-accuracy trade-offs. -```python -from optimum.intel import OVModelForCausalLM +### Hybrid quantization -model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True) -``` +Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion (SD) models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights. +The U-Net component takes up most of the overall execution time of the pipeline. Thus, optimizing just this one component can bring substantial benefits in terms of inference speed while keeping acceptable accuracy without fine-tuning. Quantizing the rest of the diffusion pipeline does not significantly improve inference performance but could potentially lead to substantial accuracy degradation. 
+Therefore, the proposal is to apply quantization in *hybrid mode* for the U-Net model and weight-only quantization for the rest of the pipeline components : +* U-Net : quantization applied on both the weights and activations +* The text encoder, VAE encoder / decoder : quantization applied on the weights -## Hybrid quantization +The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and activations of other layers, facilitating accuracy preservation post-optimization while reducing the model size. -Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights. -The UNet model takes up most of the overall execution time of the pipeline. Thus, optimizing just one model brings substantial benefits in terms of inference speed while keeping acceptable accuracy without fine-tuning. Quantizing the rest of the diffusion pipeline does not significantly improve inference performance but could potentially lead to substantial degradation of accuracy. -Therefore, the proposal is to apply quantization in *hybrid mode* for the UNet model and weight-only quantization for the rest of the pipeline components. The hybrid mode involves the quantization of weights in MatMul and Embedding layers, and activations of other layers, facilitating accuracy preservation post-optimization while reducing the model size. -The `quantization_config` is utilized to define optimization parameters for optimizing the Stable Diffusion pipeline. To enable hybrid quantization, specify the quantization dataset in the `quantization_config`. Otherwise, weight-only quantization to a specified data type (8 tr 4 bits) is applied to UNet model. +The `quantization_config` is utilized to define optimization parameters for optimizing the SD pipeline. To enable hybrid quantization, specify the quantization dataset in the `quantization_config`. If the dataset is not defined, weight-only quantization will be applied on all components. ```python from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig @@ -86,38 +144,11 @@ model = OVStableDiffusionPipeline.from_pretrained( ) ``` - - -`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. - - - -For the 4-bit weight quantization you can use the `quantization_config` to specify the optimization parameters, for example: - -```python -from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig - -model = OVModelForCausalLM.from_pretrained( - model_id, - quantization_config=OVWeightQuantizationConfig(bits=4), -) -``` - -You can tune quantization parameters to achieve a better performance accuracy trade-off as follows: - -```python -from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig - -model = OVModelForCausalLM.from_pretrained( - model_id, - quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb"), -) -``` For more details, please refer to the corresponding NNCF [documentation](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md). 
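The pipeline-level tests added earlier in this series also exercise hybrid quantization with a user-provided list of prompts instead of a predefined dataset name. Below is a minimal sketch of that usage; the checkpoint id, prompt and save directory are placeholders and not part of this patch:

```python
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# A short list of representative prompts is enough to calibrate the U-Net;
# `num_samples` caps how many calibration items are kept during collection.
calibration_prompts = [
    "dream rose covered with clean crystal, sharp edges, transparent, beautiful, highly detailed",
]
quantization_config = OVWeightQuantizationConfig(bits=8, dataset=calibration_prompts, num_samples=2)

pipeline = OVStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # placeholder checkpoint id
    export=True,
    quantization_config=quantization_config,
)
pipeline.save_pretrained("sd_hybrid_quantized")
```

When `dataset` is a custom list like this, each prompt is run through the pipeline once to collect U-Net calibration inputs, mirroring what `_prepare_unet_inputs` does for the predefined datasets.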
-## Training-time optimization +## Training-time Apart from optimizing a model after training like post-training quantization above, `optimum.openvino` also provides optimization methods during training, namely Quantization-Aware Training (QAT) and Joint Pruning, Quantization and Distillation (JPQD). From 2588077a9f57d829b6d2344b3d8b643397137132 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 13 Mar 2024 15:44:06 +0400 Subject: [PATCH 22/30] Add commit id into dev version setup (#597) * add commit id into dev version setup * do not update version if can not get commit --- setup.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/setup.py b/setup.py index ac4056c30d..3a1e1123d0 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,6 @@ +import os import re +import subprocess from setuptools import find_namespace_packages, setup @@ -8,6 +10,19 @@ filepath = "optimum/intel/version.py" with open(filepath) as version_file: (__version__,) = re.findall('__version__ = "(.*)"', version_file.read()) + if __version__.endswith(".dev0"): + dev_version_id = "" + try: + repo_root = os.path.dirname(os.path.realpath(__file__)) + dev_version_id = ( + subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], cwd=repo_root) # nosec + .strip() + .decode() + ) + dev_version_id = "+" + dev_version_id + except subprocess.CalledProcessError: + pass + __version__ = __version__ + dev_version_id except Exception as error: assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) From 8880d2e77c56cb158c96a264cc7f784b1e72189a Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 14 Mar 2024 17:15:14 +0400 Subject: [PATCH 23/30] Add openvino export configs (#568) * add openvino export configs * more libs * more libs * mixtral and model patcher * chatglm export * rework chatglm config * more testing models * rework config registration * add chatglm in tests * Update tests/openvino/test_modeling.py * fix style * gemma * add test models * qwen * fix failed tests * add comment for gemma --- optimum/exporters/openvino/__init__.py | 2 + optimum/exporters/openvino/__main__.py | 18 +- optimum/exporters/openvino/convert.py | 49 +-- optimum/exporters/openvino/model_configs.py | 391 +++++++++++++++++ optimum/exporters/openvino/model_patcher.py | 441 +++++++++++++++++++- optimum/intel/openvino/modeling_decoder.py | 4 +- optimum/intel/openvino/quantization.py | 2 +- setup.py | 4 +- tests/openvino/test_modeling.py | 72 +++- tests/openvino/utils_tests.py | 8 + 10 files changed, 933 insertions(+), 58 deletions(-) create mode 100644 optimum/exporters/openvino/model_configs.py diff --git a/optimum/exporters/openvino/__init__.py b/optimum/exporters/openvino/__init__.py index 9664f6ae6d..94ea4f103b 100644 --- a/optimum/exporters/openvino/__init__.py +++ b/optimum/exporters/openvino/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
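+# Imported for its side effects: loading this module registers the OpenVINO-specific export configs with the TasksManager.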
+import optimum.exporters.openvino.model_configs + from .__main__ import main_export from .convert import export, export_from_model, export_models, export_pytorch_via_onnx from .stateful import ensure_stateful_is_available, patch_stateful diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 1c695e2f19..02268a3604 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -58,7 +58,7 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, model_kwargs: Optional[Dict[str, Any]] = None, - custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, + custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, compression_option: Optional[str] = None, compression_ratio: Optional[float] = None, @@ -112,11 +112,11 @@ def main_export( when running `transformers-cli login` (stored in `~/.huggingface`). model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): Experimental usage: keyword arguments to pass to the model during - the export. This argument should be used along the `custom_onnx_configs` argument + the export. This argument should be used along the `custom_export_configs` argument in case, for example, the model inputs/outputs are changed (for example, if `model_kwargs={"output_attentions": True}` is passed). - custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): - Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). + custom_export_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): + Experimental usage: override the default export config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). fn_get_submodels (`Optional[Callable]`, defaults to `None`): Experimental usage: Override the default submodels that are used at the export. This is especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. 
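Because the lookups above now go through the "openvino" exporter key, the registrations performed in `model_configs.py` become visible through the regular `TasksManager` API. A small sketch, using "gemma" purely as an illustration of a model type mentioned in this patch; which types are actually registered depends on the contents of `model_configs.py`:

```python
import optimum.exporters.openvino.model_configs  # noqa: F401 -- registers the OpenVINO export configs
from optimum.exporters.tasks import TasksManager

# The exporter entry point uses this same call to list the tasks available for a model type.
supported = TasksManager.get_supported_tasks_for_model_type(
    "gemma", exporter="openvino", library_name="transformers"
)
print(sorted(supported.keys()))
```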
@@ -134,7 +134,7 @@ def main_export( ```python >>> from optimum.exporters.openvino import main_export - >>> main_export("gpt2", output="gpt2_onnx/") + >>> main_export("gpt2", output="gpt2_ov/") ``` """ @@ -206,14 +206,14 @@ def main_export( if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( - model_type, exporter="onnx", library_name=library_name + model_type, exporter="openvino", library_name=library_name ): if original_task == "auto": autodetected_message = " (auto-detected)" else: autodetected_message = "" model_tasks = TasksManager.get_supported_tasks_for_model_type( - model_type, exporter="onnx", library_name=library_name + model_type, exporter="openvino", library_name=library_name ) raise ValueError( f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." @@ -288,7 +288,7 @@ class StoreAttr(object): not custom_architecture and library_name != "diffusers" and task + "-with-past" - in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx", library_name=library_name) + in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="openvino", library_name=library_name) ): # Make -with-past the default if --task was not explicitely specified if original_task == "auto": @@ -319,7 +319,7 @@ class StoreAttr(object): ov_config=ov_config, stateful=stateful, model_kwargs=model_kwargs, - custom_onnx_configs=custom_onnx_configs, + custom_export_configs=custom_export_configs, fn_get_submodels=fn_get_submodels, preprocessors=preprocessors, device=device, diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 5353912d48..dfca80f001 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -32,10 +32,11 @@ from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx +from optimum.exporters.utils import _get_submodels_and_export_configs from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors -from ...intel.utils.import_utils import is_nncf_available, is_optimum_version +from ...intel.utils.import_utils import is_nncf_available from .model_patcher import patch_model_with_bettertransformer from .stateful import ensure_export_task_support_stateful, ensure_stateful_is_available, patch_stateful from .utils import ( @@ -48,13 +49,6 @@ ) -if is_optimum_version(">=", "1.16.99"): - from optimum.exporters.onnx.utils import _get_submodels_and_onnx_configs - -else: - from optimum.exporters.onnx.__main__ import _get_submodels_and_onnx_configs - - UNSUPPORTED_TOKENIZER_CLASSES = (T5Tokenizer, T5TokenizerFast) @@ -418,7 +412,7 @@ def ts_patched_forward(*args, **kwargs): def export_models( - models_and_onnx_configs: Dict[ + models_and_export_configs: Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] ], output_dir: Path, @@ -434,7 +428,7 @@ def export_models( Export the models to OpenVINO IR format 
Args: - models_and_onnx_configs (Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"]): + models_and_export_configs (Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"]): output_dir (Path): output directory for saving models opset (Optional[int], optional, Default to None): ONNX export opset output_names (Optional[List[str]], optional, Defaults to None): model output names @@ -459,20 +453,20 @@ def export_models( outputs = [] - if output_names is not None and len(output_names) != len(models_and_onnx_configs): + if output_names is not None and len(output_names) != len(models_and_export_configs): raise ValueError( - f"Provided custom names {output_names} for the export of {len(models_and_onnx_configs)} models. Please provide the same number of names as models to export." + f"Provided custom names {output_names} for the export of {len(models_and_export_configs)} models. Please provide the same number of names as models to export." ) - for i, model_name in enumerate(models_and_onnx_configs.keys()): - submodel, sub_onnx_config = models_and_onnx_configs[model_name] + for i, model_name in enumerate(models_and_export_configs.keys()): + submodel, sub_export_config = models_and_export_configs[model_name] output_name = output_names[i] if output_names is not None else Path(model_name + ".xml") output_path = output_dir / output_name output_path.parent.mkdir(parents=True, exist_ok=True) outputs.append( export( model=submodel, - config=sub_onnx_config, + config=sub_export_config, output=output_path, opset=opset, device=device, @@ -495,7 +489,7 @@ def export_from_model( stateful: bool = True, opset: Optional[int] = None, model_kwargs: Optional[Dict[str, Any]] = None, - custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, + custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, preprocessors: List = None, device: str = "cpu", @@ -524,14 +518,14 @@ def export_from_model( task = TasksManager._infer_task_from_model_or_model_class(model=model) except (ValueError, KeyError) as e: raise RuntimeError( - f"The model task could not be automatically inferred in `onnx_export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" + f"The model task could not be automatically inferred in `export_from_model`. Please provide the argument `task` with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" ) if ( not custom_architecture and library_name != "diffusers" and task + "-with-past" - in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx", library_name=library_name) + in TasksManager.get_supported_tasks_for_model_type(model_type, "openvino", library_name=library_name) ): # -with-past is the default. task = task + "-with-past" @@ -541,9 +535,9 @@ def export_from_model( stateful = stateful and ensure_export_task_support_stateful(task) # TODO: support onnx_config.py in the model repo - if custom_architecture and custom_onnx_configs is None: + if custom_architecture and custom_export_configs is None: raise ValueError( - f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. 
Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model_type} to be supported natively in the ONNX export." + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model_type} to be supported natively in the ONNX export." ) if task.startswith("text-generation") and model.config.is_encoder_decoder: @@ -569,11 +563,11 @@ def export_from_model( kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] ) - onnx_config, models_and_onnx_configs = _get_submodels_and_onnx_configs( + export_config, models_and_export_configs = _get_submodels_and_export_configs( model=model, task=task, monolith=False, - custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_export_configs=custom_export_configs if custom_export_configs is not None else {}, custom_architecture=custom_architecture, fn_get_submodels=fn_get_submodels, preprocessors=preprocessors, @@ -581,6 +575,7 @@ def export_from_model( model_kwargs=model_kwargs, _variant="default", legacy=False, + exporter="openvino", ) if ov_config is None: @@ -612,18 +607,18 @@ def export_from_model( model_name_or_path = model.config._name_or_path maybe_save_preprocessors(model_name_or_path, output) - files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_onnx_configs.keys()] + files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] else: # save the subcomponent configuration - for model_name in models_and_onnx_configs: - subcomponent = models_and_onnx_configs[model_name][0] + for model_name in models_and_export_configs: + subcomponent = models_and_export_configs[model_name][0] if hasattr(subcomponent, "save_config"): subcomponent.save_config(output / model_name) elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): subcomponent.config.save_pretrained(output / model_name) - files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_onnx_configs] + files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_export_configs] # Saving the additional components needed to perform inference. model.scheduler.save_pretrained(output.joinpath("scheduler")) @@ -643,7 +638,7 @@ def export_from_model( model.save_config(output) export_models( - models_and_onnx_configs=models_and_onnx_configs, + models_and_export_configs=models_and_export_configs, output_dir=output, output_names=files_subpaths, input_shapes=input_shapes, diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py new file mode 100644 index 0000000000..b6536512b1 --- /dev/null +++ b/optimum/exporters/openvino/model_configs.py @@ -0,0 +1,391 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union + +from packaging import version +from transformers.utils import is_tf_available + +from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig +from optimum.exporters.onnx.model_configs import GemmaOnnxConfig +from optimum.exporters.tasks import TasksManager +from optimum.utils import DEFAULT_DUMMY_SHAPES +from optimum.utils.input_generators import ( + DummyInputGenerator, + DummyPastKeyValuesGenerator, + DummyTextInputGenerator, + MistralDummyPastKeyValuesGenerator, +) +from optimum.utils.normalized_config import NormalizedTextConfig + +from .model_patcher import ( + BaichuanModelPatcher, + ChatGLMModelPatcher, + GemmaModelPatcher, + MixtralModelPatcher, + QwenModelPatcher, +) + + +def init_model_configs(): + supported_model_types = [ + "_SUPPORTED_MODEL_TYPE", + "_DIFFUSERS_SUPPORTED_MODEL_TYPE", + "_TIMM_SUPPORTED_MODEL_TYPE", + "_SENTENCE_TRANSFORMERS_SUPPORTED_MODEL_TYPE", + ] + + for supported_models_config in supported_model_types: + supported_models = getattr(TasksManager, supported_models_config) + for model, export_configs in supported_models.items(): + if "onnx" not in export_configs: + continue + onnx_config = export_configs["onnx"] + supported_models[model]["openvino"] = deepcopy(onnx_config) + + setattr(TasksManager, supported_models_config, supported_models) + + +init_model_configs() + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + + from optimum.exporters.onnx.model_patcher import ModelPatcher + + if is_tf_available(): + from transformers.modeling_tf_utils import TFPreTrainedModel + + +register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) + + +@register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") +class BaichaunOpenVINOConfig(TextDecoderOnnxConfig): + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" + ) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return BaichuanModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("qwen2", *["text-generation", "text-generation-with-past"], library_name="transformers") +class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class 
MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("stablelm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class StableLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.multi_query_group_num = normalized_config.multi_query_group_num + self.head_dim = normalized_config.kv_channels + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_shape = ( + self.sequence_length, + self.batch_size, + self.multi_query_group_num, + self.head_dim, + ) + past_value_shape = ( + self.sequence_length, + self.batch_size, + self.multi_query_group_num, + self.head_dim, + ) + return [ + ( + self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + +@register_in_tasks_manager("chatglm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class ChatGLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(vocab_size="padded_vocab_size", num_layers="num_layers") + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, ChatGLM2DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = ChatGLM2DummyPastKeyValuesGenerator + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) + + dummy_inputs = {} + input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] + if self.use_past_in_inputs and self.use_cache_branch is not False: + input_names.append("past_key_values") + + for input_name in input_names: + input_was_inserted = False + for dummy_input_gen in dummy_inputs_generators: + if dummy_input_gen.supports_input(input_name): + dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( + dummy_input_gen, + input_name, + framework, + input_shapes=kwargs, + ) + input_was_inserted = True + break + if not input_was_inserted: + raise RuntimeError( + f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' 
+ ) + + # refer to https://github.com/huggingface/optimum/pull/764 + if ( + self.use_past_in_inputs + and self.PAD_ATTENTION_MASK_TO_PAST + and self.use_cache_branch is not False + and "attention_mask" in dummy_inputs + ): + # Obtain the past sequence length from the value instead of the key (Bloom). ChatGLM has seq_len in 0 dim instead of -2 + past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[0] + + dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( + dummy_inputs["attention_mask"], + desired_length=past_present_length, + dim=1, + dtype=dummy_inputs["attention_mask"].dtype, + ) + + return dummy_inputs + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + """ + Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. + + Args: + inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. + direction (`str`): + either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the + output mapping, this is important for axes naming. + """ + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + present_lenght" + name = "present" + + for i in range(self._normalized_config.num_layers): + inputs_or_outputs[f"{name}.{i}.key"] = {1: "batch_size", 0: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.value"] = {1: "batch_size", 0: decoder_sequence_name} + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return ChatGLMModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("mixtral", *["text-generation", "text-generation-with-past"], library_name="transformers") +class MixtralOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 + MIN_TRANSFORMERS_VERSION = version.parse("4.34.99") + + # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 + DEFAULT_ONNX_OPSET = 14 + DUMMY_INPUT_GENERATOR_CLASSES = ( + MistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MixtralModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "gemma", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class GemmaOpenVINOConfig(GemmaOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return GemmaModelPatcher(self, model, model_kwargs=model_kwargs) + + +class QwenDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def 
__init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.kv_channels = normalized_config.kv_channels + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + past_key_shape = (self.batch_size, self.sequence_length, self.num_attention_heads, self.kv_channels) + past_value_shape = (self.batch_size, self.sequence_length, self.num_attention_heads, self.kv_channels) + return [ + ( + self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + +@register_in_tasks_manager("qwen", *["text-generation", "text-generation-with-past"]) +class QwenOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" + ) + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, QwenDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = QwenDummyPastKeyValuesGenerator + no_position_ids = False + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) + + dummy_inputs = {} + input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] + if self.use_past_in_inputs and self.use_cache_branch is not False: + input_names.append("past_key_values") + + for input_name in input_names: + input_was_inserted = False + for dummy_input_gen in dummy_inputs_generators: + if dummy_input_gen.supports_input(input_name): + dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( + dummy_input_gen, + input_name, + framework, + input_shapes=kwargs, + ) + input_was_inserted = True + break + if not input_was_inserted: + raise RuntimeError( + f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' + ) + + # refer to https://github.com/huggingface/optimum/pull/764 + if ( + self.use_past_in_inputs + and self.PAD_ATTENTION_MASK_TO_PAST + and self.use_cache_branch is not False + and "attention_mask" in dummy_inputs + ): + # Obtain the past sequence length from the value instead of the key (Bloom). Qwen has seq_len in 1 dim instead of -2 + past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[1] + + dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( + dummy_inputs["attention_mask"], + desired_length=past_present_length, + dim=1, + dtype=dummy_inputs["attention_mask"].dtype, + ) + + return dummy_inputs + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + """ + Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. 
+ + Args: + inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. + direction (`str`): + either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the + output mapping, this is important for axes naming. + """ + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + 1" + name = "present" + + for i in range(self._normalized_config.num_layers): + inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch_size", 1: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 1: decoder_sequence_name} + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return QwenModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 91dc48df05..371fee732a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,15 @@ # limitations under the License. import logging as log +import types +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import torch +import torch.nn.functional as F +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.utils import is_tf_available + +from optimum.exporters.onnx.model_patcher import DecoderModelPatcher from optimum.intel.utils.import_utils import ( _openvino_version, _torch_version, @@ -24,6 +32,15 @@ ) +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + + from optimum.exporters.onnx.config import OnnxConfig + + if is_tf_available(): + from transformers.modeling_tf_utils import TFPreTrainedModel + + def patch_model_with_bettertransformer(model): COLOR_RED = "\033[1;31m" COLOR_RESET = "\033[0m" @@ -71,3 +88,425 @@ def patch_model_with_bettertransformer(model): return model return model + + +def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ """ + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits = self.gate(hidden_states) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in 
the model and perform the computation on each expert + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx]) + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + + final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states, router_logits + + +class MixtralModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _mixtral_sparse_moe_block_forward, layer.block_sparse_moe + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward + + +def _chatglm_transformer_forward( + self, + input_ids, + position_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + full_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + if self.pre_seq_len is not None: + if past_key_values is None: + past_key_values = self.get_prompt( + batch_size=batch_size, + device=input_ids.device, + dtype=inputs_embeds.dtype, + ) + if attention_mask is not None: + attention_mask = torch.cat( + [ + attention_mask.new_ones((batch_size, self.pre_seq_len)), + attention_mask, + ], + dim=-1, + ) + + if full_attention_mask is None: + if past_key_values is not None: + full_attention_mask = torch.ones( + batch_size, + seq_length, + seq_length, + device=input_ids.device, + dtype=torch.float, + ) * float("-inf") + full_attention_mask.triu_(diagonal=1) + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + full_attention_mask = torch.cat( + ( + torch.zeros(batch_size, seq_length, past_length, device=input_ids.device), + full_attention_mask, + ), + dim=-1, + ) + full_attention_mask.unsqueeze_(1) + + # Rotary positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.seq_length) + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() + + # Run encoder. 
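+    # The encoder returns the final hidden states plus the per-layer key/value caches ("presents"),
+    # which are returned below as `past_key_values` inside BaseModelOutputWithPast.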
+ hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, + full_attention_mask, + rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +def _chatglm2_get_context_layer(query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor): + mask = torch.zeros((query_layer.shape[-2], key_layer.shape[-2]), dtype=query_layer.dtype) + if query_layer.shape[2] == key_layer.shape[2]: + tmp_mask = torch.ones((query_layer.shape[-2], key_layer.shape[-2]), dtype=torch.bool).triu(diagonal=1) + mask.masked_fill_(tmp_mask, float("-inf")) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, attn_mask=mask + ) + return context_layer + + +def _chatglm2_core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): + query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] + if attention_mask is None: + context_layer = _chatglm2_get_context_layer(query_layer, key_layer, value_layer) + else: + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, attention_mask + ) + context_layer = context_layer.permute(2, 0, 1, 3) + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.reshape(*new_context_layer_shape) + + return context_layer + + +class ChatGLMModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + super().__init__(config, model, model_kwargs) + + self.original_chatglm_transformer_forward = model.transformer.forward + + def __enter__(self): + super().__enter__() + self._model.transformer.forward = types.MethodType(_chatglm_transformer_forward, self._model.transformer) + for block in self._model.transformer.encoder.layers: + block.self_attention.core_attention._orig_forward = block.self_attention.core_attention.forward + block.self_attention.core_attention.forward = types.MethodType( + _chatglm2_core_attention_forward, block.self_attention.core_attention + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.transformer.forward = self.original_chatglm_transformer_forward + for block in self._model.transformer.encoder.layers: + block.self_attention.core_attention.forward = block.self_attention.core_attention._orig_forward + + +class GemmaModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + + # init inv_freq for torchscript tracing + # https://github.com/huggingface/transformers/blob/ed74d97871468f3a4695ede50abdc0b55717a84d/src/transformers/models/gemma/modeling_gemma.py#L108 + for layer in self._model.model.layers: + if layer.self_attn.rotary_emb.inv_freq is None: + rotary_emb = layer.self_attn.rotary_emb + layer.self_attn.rotary_emb.inv_freq = 1.0 / ( + rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) + ) + + +SUPPORT_SDPA = is_torch_version(">", "2.1.0") + + +def 
_qwen_rotate_half(x): + from einops import rearrange + + x = rearrange(x, "... (j d) -> ... j d", j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def _qwen_apply_rotary_pos_emb(t, freqs): + cos, sin = freqs + rot_dim = freqs[0].shape[-1] + cos, sin = freqs + t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] + t_ = t_.float() + t_pass_ = t_pass_.float() + t_ = (t_ * cos) + (_qwen_rotate_half(t_) * sin) + return torch.cat((t_, t_pass_), dim=-1).type_as(t) + + +def _qwen_quantize_cache_v(fdata, bits, qmax, qmin): + # b, s, head, h-dim->b, head, s, h-dim + qtype = torch.uint8 + device = fdata.device + shape = fdata.shape + + fdata_cal = torch.flatten(fdata, 2) + fmax = torch.amax(fdata_cal, dim=-1, keepdim=True) + fmin = torch.amin(fdata_cal, dim=-1, keepdim=True) + # Compute params + if qmax.device != fmax.device: + qmax = qmax.to(device) + qmin = qmin.to(device) + scale = (fmax - fmin) / (qmax - qmin) + zero = qmin - fmin / scale + scale = scale.unsqueeze(-1).repeat(1, 1, shape[2], 1).contiguous() + zero = zero.unsqueeze(-1).repeat(1, 1, shape[2], 1).contiguous() + # Quantize + res_data = fdata / scale + zero + qdata = torch.clamp(res_data, qmin, qmax).to(qtype) + return qdata.contiguous(), scale, zero + + +def _qwen_attention_forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, +): + mixed_x_layer = self.c_attn(hidden_states) + + query, key, value = mixed_x_layer.split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + if rotary_pos_emb_list is not None: + cur_len = query.shape[1] + if len(rotary_pos_emb_list) == 1: + rotary_pos_emb = rotary_pos_emb_list[0] + rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] + rotary_pos_emb = (rotary_pos_emb,) * 2 + q_pos_emb, k_pos_emb = rotary_pos_emb + # Slice the pos emb for current inference + query = _qwen_apply_rotary_pos_emb(query, q_pos_emb) + key = _qwen_apply_rotary_pos_emb(key, k_pos_emb) + else: + query_list = [] + key_list = [] + for i, rotary_pos_emb in enumerate(rotary_pos_emb_list): + rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] + rotary_pos_emb = (rotary_pos_emb,) * 2 + q_pos_emb, k_pos_emb = rotary_pos_emb + # Slice the pos emb for current inference + query_list += [_qwen_apply_rotary_pos_emb(query[i : i + 1, :, :], q_pos_emb)] + key_list += [_qwen_apply_rotary_pos_emb(key[i : i + 1, :, :], k_pos_emb)] + query = torch.cat(query_list, dim=0) + key = torch.cat(key_list, dim=0) + + if self.use_cache_quantization: + key = _qwen_quantize_cache_v(key.permute(0, 2, 1, 3), bits=8, qmin=self.cache_qmin, qmax=self.cache_qmax) + value = _qwen_quantize_cache_v(value.permute(0, 2, 1, 3), bits=8, qmin=self.cache_qmin, qmax=self.cache_qmax) + + if layer_past is not None: + past_key, past_value = layer_past[0], layer_past[1] + if self.use_cache_quantization: + # use_cache_quantization: + # present=((q_key,key_scale,key_zero_point), + # (q_value,value_scale,value_zero_point)) + key = ( + 
torch.cat((past_key[0], key[0]), dim=2), + torch.cat((past_key[1], key[1]), dim=2), + torch.cat((past_key[2], key[2]), dim=2), + ) + value = ( + torch.cat((past_value[0], value[0]), dim=2), + torch.cat((past_value[1], value[1]), dim=2), + torch.cat((past_value[2], value[2]), dim=2), + ) + else: + # not use_cache_quantization: + # present=(key,value) + key = torch.cat((past_key, key), dim=1) + value = torch.cat((past_value, value), dim=1) + + if use_cache: + present = (key, value) + else: + present = None + + if self.use_logn_attn and not self.training: + if self.use_cache_quantization: + seq_start = key[0].size(2) - query.size(1) + seq_end = key[0].size(2) + else: + seq_start = key.size(1) - query.size(1) + seq_end = key.size(1) + logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query) + query = query * logn_tensor.expand_as(query) + + if self.use_flash_attn and not self.is_fp32 and query.is_cuda: + q, k, v = query, key, value + attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask) + else: + registered_causal_mask = torch.tril( + torch.ones((key.size(1), key.size(1)), dtype=torch.bool, device=key.device) + ).view(1, 1, key.size(1), key.size(1)) + query = query.permute(0, 2, 1, 3) + if not self.use_cache_quantization: + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + + if not self.use_cache_quantization and SUPPORT_SDPA: + causal_mask = registered_causal_mask[:, :, key.size(-2) - query.size(-2) : key.size(-2), : key.size(-2)] + if attention_mask is not None: + attention_mask = attention_mask.expand(-1, -1, causal_mask.size(2), -1).masked_fill( + ~causal_mask, torch.finfo(query.dtype).min + ) + else: + attention_mask = causal_mask + attn_output = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask).transpose(1, 2) + attn_weight = None + else: + attn_output, attn_weight = self._attn(query, key, value, registered_causal_mask, attention_mask, head_mask) + context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim) + + attn_output = self.c_proj(context_layer) + + outputs = (attn_output, present) + if output_attentions: + if self.use_flash_attn and not self.is_fp32: + raise ValueError("Cannot output attentions while using flash-attn") + else: + outputs += (attn_weight,) + + return outputs + + +class QwenModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + super().__init__(config, model, model_kwargs) + + self.original_fp16 = model.config.fp16 + self.original_bf16 = model.config.bf16 + model.config.bf16 = False + model.config.fp16 = False + if self.original_fp16 or self.original_bf16: + model.to(torch.float32) + model.transformer.rotary_emb(2048) + + def __enter__(self): + super().__enter__() + for block in self._model.transformer.h: + block.attn._orig_forward = block.attn.forward + block.attn.forward = types.MethodType(_qwen_attention_forward, block.attn) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for block in self._model.transformer.h: + block.attn.forward = block.attn._orig_forward + self._model.config.bf16 = self.original_bf16 + self._model.config.fp16 = self.original_fp16 + + +class BaichuanModelPatcher(DecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any], + ): + super().__init__(config, model, 
model_kwargs) + # model has first inference buffers initialization + if self._model.lm_head.first_flag: + self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 53aa05bc5a..832c132615 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -316,7 +316,9 @@ def _reshape( shapes[inputs][0] = -1 input_name = inputs.get_any_name() if input_name.startswith("past_key_values"): - if len(inputs.partial_shape) == 3 and input_name.endswith("value"): + if ( + len(inputs.partial_shape) == 3 and input_name.endswith("value") + ) or self.config.model_type == "chatglm": shapes[inputs][1] = -1 else: shapes[inputs][2] = -1 diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index c46f29092b..2022a495d8 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -350,7 +350,7 @@ def _quantize_torchmodel( model_type = self.model.config.model_type.replace("_", "-") onnx_config_class = TasksManager.get_exporter_config_constructor( - exporter="onnx", + exporter="openvino", model=self.model, task=self.task, model_type=model_type, diff --git a/setup.py b/setup.py index 3a1e1123d0..5c6cf76404 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "optimum~=1.17", "transformers>=4.36.0,<4.39.0", + "optimum @ git+https://github.com/huggingface/optimum.git#egg=optimum", "datasets>=1.4.0", "sentencepiece", "scipy", @@ -50,6 +50,8 @@ "timm", "invisible-watermark>=0.2.0", "auto-gptq", + "transformers_stream_generator", + "einops", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 2188b7061f..9df6c73214 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -28,6 +28,7 @@ from parameterized import parameterized from PIL import Image from transformers import ( + AutoConfig, AutoFeatureExtractor, AutoModel, AutoModelForAudioClassification, @@ -52,7 +53,6 @@ from transformers.onnx.utils import get_preprocessor from utils_tests import MODEL_NAMES -from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, @@ -473,73 +473,101 @@ def test_pipeline(self, model_arch): class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", + "baichuan2", "gpt_bigcode", "blenderbot", "blenderbot-small", "bloom", + "chatglm", "codegen", # "data2vec-text", # TODO : enable when enabled in exporters + "gemma", "gpt2", "gpt_neo", "gpt_neox", "llama", # "llama_gptq", "marian", + "minicpm", "mistral", + "mixtral", "mpt", "opt", "pegasus", + "qwen", + "qwen2", + "stablelm", ) GENERATION_LENGTH = 100 IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] + not_stateful = ["gpt_bigcode"] + if is_openvino_version("<", "2024.0"): + not_stateful.append("mixtral") + + if is_openvino_version("<", "2024.1"): + not_stateful.extend(["llama", "gemma"]) if "gptq" in model_arch: self.skipTest("GPTQ model loading unsupported with AutoModelForCausalLM") set_seed(SEED) - ov_model = 
OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + + model_kwargs = {} + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs) self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) - - transformers_model = AutoModelForCausalLM.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) + self.assertEqual( + ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful + ) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + if model_arch == "qwen": + transformers_model.to(torch.float32) tokens = tokenizer( "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None ) - position_ids = None - if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: - input_shape = tokens["input_ids"].shape - position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) - ov_outputs = ov_model(**tokens, position_ids=position_ids) + ov_outputs = ov_model(**tokens) self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) self.assertTrue("past_key_values" in ov_outputs) self.assertIsInstance(ov_outputs.past_key_values, tuple) - - is_stateful = ov_model.config.model_type not in {"gpt_bigcode", "llama"} and self.IS_SUPPORT_STATEFUL + is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL self.assertEqual(ov_model.stateful, is_stateful) if is_stateful: self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) - with torch.no_grad(): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs - self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4)) del transformers_model del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): + model_kwargs = {} model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False, compile=False) + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + model = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=False, compile=False, **model_kwargs + ) model.config.encoder_no_repeat_ngram_size = 0 model.to("cpu") model.half() @@ -556,8 +584,16 @@ def test_pipeline(self, model_arch): def test_multiple_inputs(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) - model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False) - tokenizer = AutoTokenizer.from_pretrained(model_id) + if model_arch == "qwen": + self.skipTest("Qwen tokenizer does not support padding") + model_kwargs = {} + if model_arch in 
self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, **model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokenizer.pad_token = tokenizer.eos_token texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] tokens = tokenizer(texts, padding=True, return_tensors="pt") diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 97c8a92836..ad3cd03d3d 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -22,12 +22,14 @@ "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", "bert": "hf-internal-testing/tiny-random-bert", "bart": "hf-internal-testing/tiny-random-bart", + "baichuan2": "katuni4ka/tiny-random-baichuan2", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "chatglm": "katuni4ka/tiny-random-chatglm2", "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", "data2vec_text": "hf-internal-testing/tiny-random-Data2VecTextModel", "data2vec_vision": "hf-internal-testing/tiny-random-Data2VecVisionModel", @@ -38,6 +40,7 @@ "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", "electra": "hf-internal-testing/tiny-random-electra", + "gemma": "fxmarty/tiny-random-GemmaForCausalLM", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", @@ -55,7 +58,9 @@ "opt125m": "facebook/opt-125m", "marian": "sshleifer/tiny-marian-en-de", "mbart": "hf-internal-testing/tiny-random-mbart", + "minicpm": "katuni4ka/tiny-random-minicpm", "mistral": "echarlaix/tiny-random-mistral", + "mixtral": "TitanML/tiny-mixtral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", @@ -66,6 +71,8 @@ "pegasus": "hf-internal-testing/tiny-random-pegasus", "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "qwen": "katuni4ka/tiny-random-qwen", + "qwen2": "Qwen/Qwen1.5-0.5B", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", @@ -76,6 +83,7 @@ "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", + "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM", "latent-consistency": "echarlaix/tiny-random-latent-consistency", "sew": "hf-internal-testing/tiny-random-SEWModel", "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h", From f03b3569a523bc6760a2b34bc3e0dc62dbb2c9b1 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Fri, 15 Mar 2024 15:37:20 +0100 Subject: [PATCH 24/30] Update 
transformers version warning for gemma (#609) --- optimum/exporters/openvino/model_patcher.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 371fee732a..a31287d84f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -60,7 +60,7 @@ def patch_model_with_bettertransformer(model): ) if ( - getattr(model.config, "model_type") in {"gpt_bigcode", "llama"} + getattr(model.config, "model_type") in {"gpt_bigcode", "llama", "gemma"} and is_transformers_version(">=", "4.38") and is_openvino_version("<", "2024.1.0-14612") ): @@ -69,10 +69,11 @@ def patch_model_with_bettertransformer(model): _openvino_version.split("-")[0] if is_openvino_version("<=", "2024.0.0-14509") else _openvino_version ) log.warn( - COLOR_RED + f"[WARNING] Stateful models are not supported for Llama and GPTBigCode with Transformers " + COLOR_RED + + f"[WARNING] Stateful models are not supported for Llama, Gemma and GPTBigCode with Transformers " f"{_transformers_version} and OpenVINO {display_version}. For good performance, consider using a nightly OpenVINO build: " - "https://docs.openvino.ai/2024/get-started/install-openvino.html. For models that do not need transformers " - "4.38+, it is also an option to downgrade transformers: `pip install transformers==4.37.2`" + COLOR_RESET + "https://docs.openvino.ai/2024/get-started/install-openvino.html. For gpt-bigcode and llama models, " + "it is also an option to downgrade transformers: `pip install transformers==4.37.2`" + COLOR_RESET ) # model already has required SDPA implementation From 334fc103be4b86aa8a35781721057c553b8538b5 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Fri, 15 Mar 2024 15:37:44 +0100 Subject: [PATCH 25/30] Add test for openvino-nightly (#607) --- .github/workflows/test_openvino.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 6d709eecfd..ba5b09ff81 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -36,3 +36,9 @@ jobs: - name: Test with Pytest run: | pytest tests/openvino/ --ignore test_modeling_basic + - name: Test openvino-nightly + run: | + pip uninstall -y openvino + pip install openvino-nightly + python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" + optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov From 948f99d20a93a066351f7ee6a63b204fc48fc4b4 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 15 Mar 2024 18:41:56 +0400 Subject: [PATCH 26/30] Apply bettertransformer by default for causallm (#605) --- optimum/exporters/openvino/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index dfca80f001..8c49994874 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -293,7 +293,7 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") output = Path(output) - if stateful: + if ensure_export_task_support_stateful(config.task): # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect # both of them are applied to demonstrate the best performance. 
# TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation. From cd3bc8a6907e7fda0da81fd8d3a6e9f6f2c759a8 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Fri, 15 Mar 2024 15:48:22 +0100 Subject: [PATCH 27/30] Fix notebooks (#603) - use dataset for audio example to fix permission issue with URL - change pytorch_model.bin to model.safetensors - speed up quantization notebook test by using only 10 samples - update half() messaging (no longer needed to do half() on GPU) --- .github/workflows/test_openvino_notebooks.yml | 2 + .../openvino/optimum_openvino_inference.ipynb | 428 +++++++++--------- .../question_answering_quantization.ipynb | 10 +- 3 files changed, 228 insertions(+), 212 deletions(-) diff --git a/.github/workflows/test_openvino_notebooks.yml b/.github/workflows/test_openvino_notebooks.yml index abc2a65440..7b037d0565 100644 --- a/.github/workflows/test_openvino_notebooks.yml +++ b/.github/workflows/test_openvino_notebooks.yml @@ -49,5 +49,7 @@ jobs: - name: Test with Pytest run: | + sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb + sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb notebooks/openvino/question_answering_quantization.ipynb diff --git a/notebooks/openvino/optimum_openvino_inference.ipynb b/notebooks/openvino/optimum_openvino_inference.ipynb index 446e668911..b94238d358 100644 --- a/notebooks/openvino/optimum_openvino_inference.ipynb +++ b/notebooks/openvino/optimum_openvino_inference.ipynb @@ -9,7 +9,7 @@ "\n", "This notebook is a playground for running inference with OpenVINO on Transformers models with Optimum. The first part of this notebook explains the different ways to load a model, and some options specific to OpenVINO, like doing inference on an Intel GPU. The second part of this notebook consists of small examples for different supported tasks. \n", "\n", - "Do not forget to install the required dependencies before running this notebook with `pip install optimum[openvino] ipywidgets pillow torchaudio` or uncomment the cell below to install these requirements in your current Python environment. The audio classification example requires [ffmpeg](https://ffmpeg.org/download.html)." + "Do not forget to install the required dependencies before running this notebook by uncommenting the cell below to install these requirements in your current Python environment. The audio classification example requires [ffmpeg](https://ffmpeg.org/download.html)." 
] }, { @@ -17,18 +17,11 @@ "execution_count": 1, "id": "6a6774ad-912b-4053-b7f6-14dc020807ef", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:17.121564Z", - "iopub.status.busy": "2023-03-12T19:47:17.121264Z", - "iopub.status.idle": "2023-03-12T19:47:17.125098Z", - "shell.execute_reply": "2023-03-12T19:47:17.124669Z", - "shell.execute_reply.started": "2023-03-12T19:47:17.121531Z" - }, "tags": [] }, "outputs": [], "source": [ - "# %pip install optimum[openvino] ipywidgets pillow torchaudio" + "# %pip install optimum[openvino] ipywidgets pillow torchaudio soundfile librosa" ] }, { @@ -52,13 +45,6 @@ "execution_count": 2, "id": "0c89b2a2-ce31-4773-9454-3e0e57d1a231", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:17.127386Z", - "iopub.status.busy": "2023-03-12T19:47:17.127169Z", - "iopub.status.idle": "2023-03-12T19:47:24.576408Z", - "shell.execute_reply": "2023-03-12T19:47:24.575840Z", - "shell.execute_reply.started": "2023-03-12T19:47:17.127372Z" - }, "tags": [] }, "outputs": [ @@ -66,15 +52,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/openvino/offline_transformations/__init__.py:10: FutureWarning: The module is private and following namespace `offline_transformations` will be removed in the future.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n" + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " _torch_pytree._register_pytree_node(\n", + "Framework not specified. Using pt to export the model.\n", + "Using the export variant default. 
Available variants are:\n", + " - default: The default ONNX variant.\n", + "Using framework PyTorch: 2.2.0+cpu\n", + "/home/helena/venvs/openvino_env/lib/python3.10/site-packages/transformers/models/distilbert/modeling_distilbert.py:246: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n", + " mask, torch.tensor(torch.finfo(scores.dtype).min)\n", + "Compiling the model to CPU ...\n", + "Compiling the model to CPU ...\n" ] } ], @@ -104,13 +101,6 @@ "execution_count": 3, "id": "8053abe3-0e1f-445d-8397-630efac28269", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:24.577918Z", - "iopub.status.busy": "2023-03-12T19:47:24.577422Z", - "iopub.status.idle": "2023-03-12T19:47:25.276093Z", - "shell.execute_reply": "2023-03-12T19:47:25.275685Z", - "shell.execute_reply.started": "2023-03-12T19:47:24.577895Z" - }, "tags": [] }, "outputs": [ @@ -154,13 +144,6 @@ "execution_count": 4, "id": "dcf7a5c3-81ba-42cb-a7d9-d22c5bb00325", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:25.276779Z", - "iopub.status.busy": "2023-03-12T19:47:25.276603Z", - "iopub.status.idle": "2023-03-12T19:47:25.278703Z", - "shell.execute_reply": "2023-03-12T19:47:25.278355Z", - "shell.execute_reply.started": "2023-03-12T19:47:25.276764Z" - }, "tags": [] }, "outputs": [], @@ -174,20 +157,20 @@ "execution_count": 5, "id": "648a8eb1-1d50-4503-8094-c9a88098bee9", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:25.279302Z", - "iopub.status.busy": "2023-03-12T19:47:25.279139Z", - "iopub.status.idle": "2023-03-12T19:47:26.802333Z", - "shell.execute_reply": "2023-03-12T19:47:26.801969Z", - "shell.execute_reply.started": "2023-03-12T19:47:25.279288Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ - "{'score': 0.8515876531600952,\n", + "{'score': 0.8515874147415161,\n", " 'start': 12,\n", " 'end': 64,\n", " 'answer': 'a framework for deep learning inference optimization'}" @@ -234,16 +217,16 @@ "execution_count": 6, "id": "c5d8d4be-3449-4a78-aa92-56ef2b355572", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:26.802936Z", - "iopub.status.busy": "2023-03-12T19:47:26.802808Z", - "iopub.status.idle": "2023-03-12T19:47:27.259642Z", - "shell.execute_reply": "2023-03-12T19:47:27.259125Z", - "shell.execute_reply.started": "2023-03-12T19:47:26.802923Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ @@ -285,10 +268,10 @@ "### OpenVINO features\n", "\n", "- For improved performance, it is sometimes useful to reshape the model to use static input shapes\n", - "- Models can be compressed to FP16, which reduces model size by half, and improves performance on GPU (because GPUs contain optimizations for computations with FP16 data).\n", + "- On GPU, inference uses FP16 by default (GPUs contain optimizations for computations with FP16 data). On 4th generation and later Intel® Xeon® Scalable Processors, inference uses BF16 by default. 
\n", "- OpenVINO supports inference on Intel GPU, either an integrated GPU in your laptop or desktop, or an Intel discrete GPU, for example Intel Arc. \n", "\n", - "By default, when loading a model with `model = OVModelForXxx.from_pretrained(model_id)`, it is compiled on CPU. If you know you want to use GPU inference, static shapes, or FP16, you can set `compile=False` to the `.from_pretrained()` method, to skip the compilation step, as the model will have to be compiled again after steps such as reshaping, fp16 conversion or changing device. The model can then be compile with `model.compile()`. In the case the model was not compiled, it will be automatically done before the first inference, resulting in an increase of the first inference latency, since it will include the model compilation time." + "By default, when loading a model with `model = OVModelForXxx.from_pretrained(model_id)`, it is compiled on CPU. If you need to modify the model, for example to use static shapes, you can set `compile=False` to the `.from_pretrained()` method, to skip the compilation step, as the model will have to be compiled again after steps such as reshaping or changing device. The model can then be compile with `model.compile()`. In the case the model was not compiled, it will be automatically done before the first inference, resulting in an increase of the first inference latency, since it will include the model compilation time." ] }, { @@ -316,20 +299,20 @@ "execution_count": 7, "id": "e0754efa-0beb-4060-8633-daecc5ebca31", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:27.261759Z", - "iopub.status.busy": "2023-03-12T19:47:27.261529Z", - "iopub.status.idle": "2023-03-12T19:47:31.027499Z", - "shell.execute_reply": "2023-03-12T19:47:31.027078Z", - "shell.execute_reply.started": "2023-03-12T19:47:27.261738Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ - "{'score': 0.7991431951522827,\n", + "{'score': 0.7991424798965454,\n", " 'start': 12,\n", " 'end': 62,\n", " 'answer': 'a toolkit for deep learning inference optimization'}" @@ -366,12 +349,12 @@ }, { "cell_type": "markdown", - "id": "3a564882-dfd2-4f62-803b-f1e3485736c3", + "id": "8798b8d5-50ad-439f-a1d8-886d579a91fe", "metadata": {}, "source": [ - "#### Compressing model weights to FP16\n", + "#### Saving Model in FP16 format\n", "\n", - "Compressing model weights saves disk space, and speeds up inference on Intel GPU." + "`model.half()` converts the model weights to FP16 precision. This reduces the size of the model by half, with usually a negligible impact on accuracy." ] }, { @@ -379,13 +362,6 @@ "execution_count": 8, "id": "09f443d5-ff58-416c-ab70-bee16e9f8235", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:31.028314Z", - "iopub.status.busy": "2023-03-12T19:47:31.028164Z", - "iopub.status.idle": "2023-03-12T19:47:32.026899Z", - "shell.execute_reply": "2023-03-12T19:47:32.026017Z", - "shell.execute_reply.started": "2023-03-12T19:47:31.028301Z" - }, "tags": [] }, "outputs": [], @@ -401,7 +377,16 @@ "source": [ "#### Loading Model on GPU\n", "\n", - "For GPU inference, we recommend using FP16. 
OpenVINO support for dynamic shapes on GPU is in preview mode, so for now we recommend using static shapes.\n", + "A model can be loaded to GPU by using `model.to(\"GPU\")` or by passing `device` to `from_pretrained()`.\n", + "\n", + "GPU inference will automatically run with FP16 precision, regardless of the precision of the weights of the model. To override this, and force FP32 precision you can pass an `ov_config` argument to `.from_pretrained()`: \n", + "\n", + "```\n", + "model = OVModelForQuestionAnswering.from_pretrained(model_id,\n", + " device_name=\"GPU\",\n", + " ov_config={\"INFERENCE_PRECISION_HINT\": \"f32\"}\n", + ")\n", + "```\n", "\n", "OpenVINO's `Core().available_devices` property shows the supported devices on the system. " ] @@ -411,13 +396,6 @@ "execution_count": 9, "id": "09f13ef7-2321-47c6-a08f-5fbe61b32b43", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:32.029229Z", - "iopub.status.busy": "2023-03-12T19:47:32.028882Z", - "iopub.status.idle": "2023-03-12T19:47:32.061033Z", - "shell.execute_reply": "2023-03-12T19:47:32.060524Z", - "shell.execute_reply.started": "2023-03-12T19:47:32.029205Z" - }, "tags": [] }, "outputs": [ @@ -426,7 +404,7 @@ "output_type": "stream", "text": [ "CPU 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz\n", - "GPU Intel(R) Iris(R) Xe Graphics [0x9a49] (iGPU)\n" + "GPU Intel(R) Iris(R) Xe Graphics (iGPU)\n" ] } ], @@ -442,16 +420,17 @@ "execution_count": 10, "id": "bf5194b5-5e80-4597-85d4-fab72fbed2fa", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:32.061881Z", - "iopub.status.busy": "2023-03-12T19:47:32.061689Z", - "iopub.status.idle": "2023-03-12T19:47:33.067283Z", - "shell.execute_reply": "2023-03-12T19:47:33.066928Z", - "shell.execute_reply.started": "2023-03-12T19:47:32.061868Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to GPU ...\n", + "Setting OpenVINO CACHE_DIR to /home/helena/.cache/huggingface/hub/models--helenai--distilbert-base-uncased-distilled-squad-ov-fp32/snapshots/a9da64102a84c4b3f110c4d627937a110e56257f/model_cache\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -461,15 +440,35 @@ } ], "source": [ - "# Compile the model on GPU if a GPU is found\n", + "# Use `model.to()` to compile the model on GPU if a GPU is found\n", "if \"GPU\" in Core().available_devices:\n", - " model.half()\n", " model.reshape(1, 28)\n", " model.to(\"gpu\")\n", " model.compile()\n", " print(ov_pipe.model._device)" ] }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2538c02e-fa35-459f-90fb-e6667ef8747b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to GPU ...\n", + "Setting OpenVINO CACHE_DIR to distilbert-base-uncased-distilled-squad-ov-fp16/model_cache\n" + ] + } + ], + "source": [ + "# Set the device directly with `.from_pretrained()`\n", + "if \"GPU\" in Core().available_devices:\n", + " model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")" + ] + }, { "cell_type": "markdown", "id": "8e3a06fd-e51b-404e-b82a-2557e3db1375", @@ -507,23 +506,48 @@ "source": [ "Audio classification is the task of automatically categorizing audio data into classes or categories. 
See Hugging Face's [audio-classification](https://huggingface.co/tasks/audio-classificationhttps://huggingface.co/tasks/audio-classification) documentation for more information.\n", "\n", - "In this example, we use the [MIT/ast-finetuned-speech-commands-v2](https://huggingface.co/MIT/ast-finetuned-speech-commands-v2) model to do inference on an audio file from the [speech commands](https://huggingface.co/datasets/speech_commands/viewer/v0.01/test) dataset. You can try your own audio file too. To see the classes that this model was trained on, run `model.config.id2label`\n", + "In this example, we use the [MIT/ast-finetuned-speech-commands-v2](https://huggingface.co/MIT/ast-finetuned-speech-commands-v2) model to do inference on an audio file from the [speech commands](https://huggingface.co/datasets/speech_commands/viewer/v0.01/test) dataset. You can try your own audio file too. To do that, set `audio_sample = /path/to/audio_file`. To see the classes that this model was trained on, run `model.config.id2label`\n", "\n", "The model pipeline needs ffmpeg. On Ubuntu Linux: `sudo apt install ffmpeg`; see https://ffmpeg.org/download.html for other OSs" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "570fd296-15a1-437d-97db-76b91407dc3c", + "execution_count": 12, + "id": "6b725d0a-e230-4b0e-b6b6-3d6d81eb984d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from IPython.display import Audio\n", + "from optimum.intel.openvino import OVModelForAudioClassification\n", + "from transformers import AutoFeatureExtractor, pipeline\n", + "from datasets import load_dataset\n", + "\n", + "model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n", + "model = OVModelForAudioClassification.from_pretrained(model_id)\n", + "feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)\n", + "ov_pipe = pipeline(\"audio-classification\", model=model, feature_extractor=feature_extractor)\n", + "\n", + "# streaming=true enables loading one item from the dataset without downloading the full dataset\n", + "dataset = load_dataset(\"speech_commands\", \"v0.02\", streaming=True)\n", + "audio_sample = next(iter(dataset[\"test\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d3012174-b58f-4efb-bb73-a88dbfb20392", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:33.068965Z", - "iopub.status.busy": "2023-03-12T19:47:33.068575Z", - "iopub.status.idle": "2023-03-12T19:47:38.120307Z", - "shell.execute_reply": "2023-03-12T19:47:38.119775Z", - "shell.execute_reply.started": "2023-03-12T19:47:33.068947Z" - }, "tags": [] }, "outputs": [ @@ -532,7 +556,7 @@ "text/html": [ "\n", " \n", " " @@ -547,29 +571,28 @@ { "data": { "text/plain": [ - "[{'score': 0.9999880790710449, 'label': 'down'},\n", - " {'score': 7.452485419889854e-07, 'label': 'five'},\n", - " {'score': 7.436851205966377e-07, 'label': 'go'}]" + "[{'score': 0.9999935626983643, 'label': 'backward'},\n", + " {'score': 3.4823816008611175e-07, 'label': 'forward'},\n", + " {'score': 3.3890643180711777e-07, 'label': 'wow'}]" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from IPython.display import Audio\n", - "from optimum.intel.openvino import OVModelForAudioClassification\n", - "from transformers import AutoFeatureExtractor, pipeline\n", + "if isinstance(audio_sample, dict):\n", + " audio_data = 
audio_sample[\"audio\"][\"array\"]\n", + " sampling_rate = audio_sample[\"audio\"][\"sampling_rate\"]\n", + "else:\n", + " # if audio_sample is not a dataset item, it should be the path to an audio file\n", + " audio_data = audio_sample\n", + " sampling_rate = None\n", "\n", - "model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n", - "model = OVModelForAudioClassification.from_pretrained(model_id)\n", - "feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)\n", - "ov_pipe = pipeline(\"audio-classification\", model=model, feature_extractor=feature_extractor)\n", + "display(Audio(audio_data, rate=sampling_rate))\n", "\n", - "audio_url_or_file = \"https://datasets-server.huggingface.co/assets/speech_commands/--/v0.01/test/38/audio/audio.mp3\"\n", - "display(Audio(audio_url_or_file))\n", - "ov_pipe(audio_url_or_file, top_k=3)" + "ov_pipe(audio_data, top_k=3)" ] }, { @@ -586,16 +609,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "5207a5af-3b53-43b3-ae5e-5b352c0f08d4", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:38.121136Z", - "iopub.status.busy": "2023-03-12T19:47:38.120930Z", - "iopub.status.idle": "2023-03-12T19:47:42.382362Z", - "shell.execute_reply": "2023-03-12T19:47:42.381733Z", - "shell.execute_reply.started": "2023-03-12T19:47:38.121116Z" - }, "tags": [] }, "outputs": [ @@ -603,18 +619,20 @@ "name": "stderr", "output_type": "stream", "text": [ + "Provided model does not contain state. It may lead to sub-optimal performance.Please reexport model with updated OpenVINO version >= 2023.3.0 calling the `from_pretrained` method with original model and `export=True` parameter\n", + "Compiling the model to CPU ...\n", "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "data": { "text/plain": [ - "[{'generated_text': \"Hello, I'm a language model, so you might consider changing your code to a similar type for Python 2 or 3.\\n\\nWhy Python 2\"},\n", - " {'generated_text': \"Hello, I'm a language model, so the second statement is true or true. You can also have both. It's not strictly necessary, but\"},\n", - " {'generated_text': 'Hello, I\\'m a language model, and we\\'re seeing it all in the Java world,\" he added.\\n\\nSo what might happen next?'}]" + "[{'generated_text': \"Hello, I'm a language model, so I'm really interested in programming. I've always been a programming student. But for a long time,\"},\n", + " {'generated_text': \"Hello, I'm a language model, because I don't think I ever spoke on paper to an editor. I'm simply someone reading a paper.\"},\n", + " {'generated_text': \"Hello, I'm a language model, I understand. Your mother was talking to you. 
No, well, my grandmother had said that she heard '\"}]" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -648,19 +666,19 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "97569002-0c0a-4208-9b63-b2bab209fb81", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:42.383244Z", - "iopub.status.busy": "2023-03-12T19:47:42.383095Z", - "iopub.status.idle": "2023-03-12T19:47:48.887705Z", - "shell.execute_reply": "2023-03-12T19:47:48.887095Z", - "shell.execute_reply.started": "2023-03-12T19:47:42.383228Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "image/jpeg": "/9j/4QAoRXhpZgAATU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAAAAAAD/7QCEUGhvdG9zaG9wIDMuMAA4QklNBAQAAAAAAGccAVoAAxslRxwBAAACAAQcAgAAAgAEHALmAEVodHRwczovL2ZsaWNrci5jb20vZS9BdlNGTzd3bFF3TWxuRUJBbGdJUFpybkxmZ0Y1WTU2alRjR1IlMkJFMGlPZG8lM0QcAgAAAgAEAP/gABBKRklGAAECAAABAAEAAP/bAEMAAwICAwICAwMDAwQDAwQFCAUFBAQFCgcHBggMCgwMCwoLCw0OEhANDhEOCwsQFhARExQVFRUMDxcYFhQYEhQVFP/bAEMBAwQEBQQFCQUFCRQNCw0UFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFP/AABEIAQsBkAMAEQABEQECEQH/xAAdAAABBQEBAQEAAAAAAAAAAAAFAQIDBAYHAAgJ/8QARBAAAgEDAwIFAgQEAwgBAgYDAQIDBAURABIhBjETIkFRYQcUMnGBkQgVI6FCscEWJDNSYtHw8XKC4RclNDWSokNTY//EABsBAAIDAQEBAAAAAAAAAAAAAAACAQMEBQYH/8QANhEAAgICAgEDAwIEBQQDAQEAAAECEQMhEjEEEyJBBVFhMnEUQoGRFVKhsfAjM8HRBuHxQ2L/2gAMAwAAARECEQA/ANEE19VPGDgg9BoAkCHQA4JoAcEOggcE0WCFCaiwHBDosgXZqAFEY0AOCaCGKE+NAouw6AF2aiwFCDRYC7NRYHtmiwHBTjRYChNQB7ZoA9s0ALs0Ae2Y1FgLtGiwPBBosD2z41FgLs+NFge2/GiwG7BosBdg0WAmzRYHtmiwE2amwEMZ0WAhTGpATZ8aAE2DQAmzU2Ahj0WAhXRYDSg1NkibNACbDqRhpT40EUNKDQAhTGgkYUzoIGlNBIwoPUaAGFNTYE4j0WMPVNFgPCcaLAeE1FgO8PQQOEfxoJFEeoIHbOdFkChMemiwHBM+mosBQmoIYoj0EC+HoIF8P40Ae2Y0AKE0ALsOoAXZosDwTGoAXYNAC7NAHgvxqAFCZ9NDdEpWL4egg94egD2w6APbDoA94Z0AIVx6aAPbfjQAhTOgD3h6AG7NAHtmgBCmgD2w6AE2akBNmpsBCg0WA0posBPDJ1JIhjx6aCRvh6CRpTnQAhTU2A0pqbAaY9SAxk/TQAxk0ARsmgCwE+NAxII9QA8JosB6pjUWQOCHRYDvD1BAuzQTQ4R/GggcE1Fge2aLIHbfjRZAu341FhYoQ+2iyD2w6AF2agD3h6AHBBoAUR40WB7YfbQAvh/lqLA94eiwF2aLIHxU5kzyy4GSVGdUZZUl+5di23+w3ZnVxULsHroATYNAHtnGgD3h6AE2fOgg9s0Ae2amyRNmiwEKY0WAmzjtosD2zOpsBNg0WAhTRYCbM6AEKfGgBuzQAmznQAmw6AE2n21JI0roskQpnU2SNMfGiwGFP31IDGTjU2AwpxqQGMmpAsBNLYxIqagBwj/XQA8J8aAHBNRYDgvxqLIHBNFkC7NQAoj0EMUJoIHBNBAvh6AF8PRYCiPnUWAvh/GiwFEfHbRYHvD1ADtmgD3h/OgBQmoA9sGgD2z40AOWSKmYPMwVT5QT7ngazZ37V+6NOCPKTS+zE2YzxrRZmPbPjRYHvD0WB7w/jU2AmzOgD2z40Ae2aAPbNFgJs0AJs0Ae2akBNmoATZ341ICbPjQAnh6AEKaLATZosBDHn001gNKaLAbs0AIU+NBI3YNBNjSmgLGsvvqQGlM6mwI2j1JIwofbU2BOCu4DcMnsPfS2hyUJ20WQPWPUAOCagWx4T40AOEegBQnOiwHCP41FkCiP41FkDhHosgXw9AC+H8aAF2fGoAXw9Fge2aLAds1Fge8POiwF2fGiwF2c6iwF2aLA94fGiwPeHoAkhsz3mRaeNWdgd4VBycc/6ax+TkUIpv7m3xU3J19mMMetVmI9s1NgJ4eiwF8POiwE8PRYHvD0Ae2caLATafbUge8P40AJtHtoATw9ACFNACbDqbATZ8amwEKfGiwE2aAEKakBNh0ANKfGgBCg0ANKamwGmP41NgN8PRYDdmgBjJoJGlP10AMZM6ayT5JivlZYbgtRBcJ3MQzG4kLEHGfX89fOIZskalBtM9pPFD7aNr0X9Wr/AGylkjqnNTEXLK9wOWGfQHg410IfVM+HTfL9zN/AYsm2qOi9OfWmgqQsF7h/llSow8oy0Jb/AKT3118H1fFOlkVf7HOzfTckNwdnQ7bcaS7QGaiqI6mIHaXjOQD7a7MMscq5Qdo5M4Sxupqi4I9OIO2DQBCKum3Mv3EO4HBHiDIPtpeUfuTxf2LQTPrqSGhQmixaHBM+miwoXw9RZNC+HqLIF2fOiyD3h6iyaHCPPposKF8P41AUL4fGiyD3h6LAUJoAXYNQAmwaAPbNAFO4XRLaVRCxqWUugB2hcepPr68aweVK6idbwcXPky6Y8knGPXW6O0jlyVSaPbNSKe2Y0Ae2aAE2aAPbBoA9s0AeKaAE2fOpATZ8aAPbPjQAhTRYCFNTYCbNFgJs0WAmw+2pAaUz6amwGlNFgIU/XRYCFfjU2A0rosBpTQAhT31IDCmgBrR/GpsCMpqbAYU+NAHyxfZaimhpg1rp6qBlBgJg8yD0U+/GvlMGt0z6HPXwXaZ6+WBrZWxxQ7m3sZYgqkkejDsQMDUNq+SH
[... remainder of the base64-encoded JPEG output of the image-classification cell omitted ...]
AkLwSv5j34zjntx86SWWfUYsZY18s0NTZ6Opjgp5qiCrKkAO8O/H+ncazLknpVZY66bKDUNmaZYKKGGVmYLsWRdgIH7j8vnV75xWxFxukXbTXJGJYLlVUjqSBHBCwGVBxnBPPOklye0i2Cj/ADOya5UEW9Y4I4IYySvihPKV9D78HHOqovbjQsvwiG1U7NHHBQ/bTJ5pVEY3KQO4Ud/XVc21KmRG30etVuuDNNDK0cEbKeHpmQAk8+v+WhyraLFB3Vl006W8Z+8oKeGoABxGFIfGDnPJwM/nn408ZUvcwpN1Amho5HRpXrmO8RlSFEgVQSCRnkE6l51+43p/LKi9MmFJvtg1RMT4gE5yA2MkAe3wNQ/IkR6dfp2MutslU+BUKjp5VkGd4kYtgLjOcD079tVrPKwlBLthmls9FRTxyRW8SNLwJjHgQsqgA7e3fHHzq5Zl0HGukQVIq2gElDHGrFDIJpwd0Skkk4Hc+Uj9dVuaohQcuiKqe6NSeE1XBHONkvirwVUAjJB9yc6olla+RvTcl2RS28F3guFW1R4SL4RTMbMCAAfbOc/nnTKUm+2PPHGK2LcOnrYUqYo1lkdvMtMjswVfQbfQ8kfqdXcp1oqpXpFqh6dpVoyWppkVU2lWbcFiLAjOT3HPzjRBTe2K+PTLdRa7VR4r5zERFgwkzEkEtjIGeB6Y07T3QrcY+5k1oqbNUTb6SMRl4/FCzAh2Hf8AF6jgcj1Ok0ldhGcbtII23+TyVkMUEcTq5kIaVdw3bsEKPcZz+mq+Sv7j80ukS0RipqpHNTDHIZFmSIQ+GPDO08MOM8Y7/n306aa0Lzm+j8+uounvs/rD1vbfKEhuNYyA8AozM68D/pYa+j+JPnghJfZG/E1Vs5Lf3/3xhggKcAEYOtcdseX2Be7OrBDoP0igFVeBTkBvEOMEZzpJFsA79aaqMVngwnC5C7QQQMDAOe+eP7aUmb0cjkOXY+mdWrSKUar6W9Q1fR/XVnv9LTTVIt1QssscB2s0ZyrgE8AlSRzrJ5WH18MsV02iJRclSPujoH609M/U24vS0klXa62MAx09xUQzy787/DZSVblVyM9ifnXgfJ8LyvFjymk190ZJYpRdyOlyWg0Er/78k0vhBY12ZyQOVOO59j7d8a5ybI5NPTKtPRVMluqJZpzPIhVWhKhZW8rZC5OOCuDjgfro91P7C8pL9wdRU1VMu2qSWmkaFfHhaMgt58NggkhgQWx27AaT3XQluT9xO/SbVE/itUPHUmdY4zEV2grnZtHrlf7g6FGX9R2kyzIqWqnkd5UnfxFCR1B2huQe49iOQRzg99QlJPYWkrstz9QCnuEkVldW8CISF48SbSx3EbW7kZBHuGGiUG+mK2u0D46uqkqKhGil2O6pnZhUcHduJAGcgggfp6aWOFK/uRZLtkq7iiIrrIqYWokQhi4IG5QeR5j3Pr2B1coe6kS6ZCsbVEEkkQRckYZsRtK2TyCBhScqM8gADUpKW0hf3C1s6engnDVMIqPAj34chlaLeSVOO34cnHx20yxlkXx6IKGwRSfcr4zSRxN4khE5TJUMVKkjOCMZAzk+2pUE99ix+5PSXCOJoRUTwJTRT52xLklyASN2RxwPUjk+/LRlS0RyKlyrIqCnZVkU/wBZ0JlzhcZZJEyOAxzgc++ANDVOxrpdAynvr3aeaWGAUlvgqI4IVpsMXQMCxVQO59eRwTpXKLlsROUv2Lc7Us9wqlroHlp3m3SRxcleThtwwcKQx/LJ51FLk2y1RT0wZcLhbv6irQxRxtLv3RRsDMpyEBIO4eYHBPyD8S+NdaKqUkD7hfa2Cjjpp6WN0CrMu6HlV5IBHByPMSG9BnnVijUdkOTqkiGjqqmuvk7pCimJWDx1MSsAM5OTx3UrjHfGT6ae+nREYylJ0gJaaq6x1SwVUsUr5UGUHBUkcDtgkDJ9tXSikhFKV6J7hHd71QJNSVaw28ytE+47ZSM/OCCcfHY6sgox+BpcpLsrVXT891t0cM1ZURR8kyJIvhspzhuMnPcc/wCR06ywh0heLa2Vem4LZaoJ6Sor6ur3yeGisVDIoxljnJydvcc9xnnUT8mT0tDJxWmGv9ximjgWjp4YZVCCoKuFB9clTx+ZP99UuVvbJdL4KVP0vbIz95TrTLUMzBamGYgh8nOATgntx3AOk5uv1WTa+C7R2SpWKD7mshqZZHy0kNMjEIPUnHcYPAOlnKFaY8Yu9sKVFwp5qWlWeeabwgIm3xBJHIGQSoGMntgazOKk9aLpfYksnUrLSmoitMcbbSkecb93Plwex5APPfViwQXbIjO91sI019uEtZ9qsAO1iGjyAyjBY7dwwTkf3wNPHBBDvLK6SKtyudbZJ1jf7aEht3isxOOSTjj0HI5yfTUT43xSF5OBBLXVaVoqRMKimEG6Jhk5ZivH5gEcDP8AbUrEn2HKd2Om6geSOmqY65aWRP6oSYkFcrtPfk8HOBg5HbVM8cbST2K5Nr7D6eGuqXEjNHPUruBWLJbjvyO5x+2s0cKcv1DuNKxz22qliAUtLGuHRllwT5u2T7YyffGtKhhxvuxFGVWFaiyStcMhvEMeQZ+TuIzwBxx650c8adFjWyGShKU9LF9v4s1RIzSMqbvDAxkH8zz/AJ6X1IJa7B7dJaM3EldJVKJIGCmpBii8DAZcBQQ3fIAxg8c8adTTrZVx/Afiqam0VBE9LJEylikE4YvuOGWPAODkarc3FlySUt9InobuKmGqKLEGWFPGpwgVkG7zFecYHBx6g50ym6tFU8ik7qiKpoIKGan+9ipjSBzTVFPsDBVYqysT3wGOT+Xtpla7K5NR1IgrJksMDVdHF/SVBE5yN4Ck7dg9RjVdWS58VdGoo71DK8UgoPCURFBIyhAvl/4mOcAkscjj09dSpL4RY3q0gLTXWqllqaaaKWmE7FzG0JV1AI2pHkkMG4OT27451FNaZVzb0fI31Lhmpf4nrulZEwerMBdCOX8SmQD4OeOf117/AOl2/Fjf/Nm7BbWzinX9EKW+TeXaGPb5Hf8Avrrw7Lp/gyzLjVrER076HoI7w9VIhaOEZ49WJCgfH4s5+NUyLoFD6sVyVHVFSsUheOMkkkYOf/eoRXPs59q4g2XTtfW2qgVIfDp4j52mkBySR/21mk7ZohcVo3HRd8kkET09wJKRuk+7CN2wUU/4geODjURWx5vlE7h/C79cqmvv9T091NNJVXCnimqKGrl8zSxImWhYj1VVJB5JGV9teZ+r+HGFeRjVff8A8M5GWKj7kdrrepbfSS22GNmGIwUkB3DDO3kYKOSpwMYzuONeQ5OlRl5paLs3VcdMqLtnqHkHiI+8MIgSRz2xt2A459f1e38BzaIJ7mlyt0ckNXLRSPG24HD8IfJIq8fIwRkZ9ckaW7qiFO+mQ008MaSpTl6vuZZanc5jyMbccEdxzgYY6ja/SF6pAuqvbUdQ5VWeSdRO6NT4Ktyq5PB8TyMcdvw++nqSXIRt/Yv1fVV1ilqoBEGramiplMaqzeICSQxBGQT+EcDt69zbbb0x7auI49R
XGOelqvt0SraGVgrNjBUZfc2fO5Gdo4OVwOcalKVp/JPufwQx9U1NPDGU8Skkklkp2+6TCxAc8t6EenAyMc+UjQnT0LbrZbqOq4LfRsyuZwsZELR8PJgqQhyexf0GOx582nu+h7SWi9Jc4XSAMWjhlG+OaOTJj7Fdhzgc7hg57HPbUppJJdEt0tjqkJU2irLCJYnZHbEoKrhskjI7ttxj3ORwDpWlv8kumijWw2+uudwjmllFbSMojmGVVgy+bfjOMgbiO2Memh1Em1K2+wZb6m3x1lMlBVxoIF8OFXOAzhcsrMAcldzMM9sDnjSvvRMZpJV8Aq63uFac0lLLIktLI7yrIXRk8vlUE5BGNpz2O4jSvpRKnP7F6juE8yf0p2paR/6s6Rxf1F3eVmU9nG3B5x685GrIyuI8Zu/bovXlaSG1ikhnWaqbEdQ4BE8mZOURgxO3Z2APfA/xcu3aqx24rSYKqbY0T0yW9Enm3Nslgm3QlEYFmYE85yo24zhiOwI0L7IRNPfyZ6hudJTrFWSU0u9iFd8YZDggkKTnIwOf8tbHhf3M0eK2wrLVp92vjP8A7v4B3IoXO5iSqkgHcMHn3I0jwvasstNlKoW+28UskUaVgVHWommCALHgEY9Sw4P644zqPTjBXIsim0WaezV9ZBSz1gp5aZ/6bNG4ZwByGDY8vdQPYZ0nGBHf6iOWB6dfEpw9XNBKi+GsR2lcluPTGTwO3cZGk9JO9kct2EbQsNe0VJNRpvd3ljaZshGB2sufyAPvn8tVvEl8jxlH9NBCkoamSthkR6eeRISqRwxFccMFznAPmI79z+Wl4Rf6iyMmpqS7Bd3pKyrkEXjeK8U4CtSN5yCFIz2GAePTHroc1BUkK3ObbsuRW6mVZFlWqSF2MGHTkqD+Idzw3xyNUyzSWqFUb7GVMtDS3YBDWJEIiN0UgyhAGwc/OBn5Gro5JvYsmrpBGGkhhmLS1zTyzIc75AwYlgV5x6c86JbQ6mrthyntUMEInndDSwv4e9pQQxxg5B7Dtz+p1TKUvuXudf0JKaK2M8tTFKplhjEjoIvNsPCjB/8Aq7fHbVXFyfZZHJBe5iM0NsWaS3JIJ1KNvKllOQc4PGMeuMj350Si/wCUJTg0uEa/qV6KrmiaaamMrqfMSyeKoBH+H1Xk8988aX0/ixF25JaJpmmd6F8ssaMxnk8QKrHO1V2nuR8dv11KxxSoi2tsrWirmo95qZhTU0hdA04/4m4YAxge/Ykf6asgoqylSaZoPtpbTFM9dU/cmRMhCwLplSA49wBt5xnjRUE26GcpK02VaWirULpb6xp7gp8WMTtiMyMoXDKScH4z2I1Mcjb0iv3dJgKrtV0prvJ4kC4LnwmOPOQAxwM8j0wfQ8aqcpPQrhK9ktZSVVdao5JpVC1DKiqwG4puAYsOwGAcE9+NLylpN9jOMpIsrSxRI9E9AphhqZTFVSqVUSEgAsPTdg9h659dWKUv0shwpN1pFitqKG2wzF4C7xESQSzKMny5KBc5H+uRwNaoR5dA/bplqm6zhqKhq+novvqKnMZ/mcJHi+JMWUIFb/GuxSVb/mGfbWmWNpkKS/Uj47+u033P8SD1YXz1FNSSZSMpkhGUEDP/AEjtr2H0u149P4bN+B3E4j9TqoVd8ebZsMztLjHI3HOP3zrrrs0TsxjHJ1aIjvP0Boo6fpu518kZfPiKO2MKpb19yAP11XLsujpHKesZvFuNQ/8Ailct+51H8xXIzIGTjVpBrLlVyTWxN7vFEQEWMD0UAAnWP5NMn7QBRXKotdSJIHK4PI9DrTSaM6dG1k6take3dS2moFHdKWTDIG53Ae3cqwJB+CdUZcUcsHjmrTKppSVP5Pv3pm60fUdits9Mnh1VTTx1aRqyeHDwGeM5GAQpIwcHj37/AC2WLhNwXw6ObSsrXGoo7vFWSUcYoJpqaMpWSZDKpCMY2AUgsBk7uxxgeumnxq2K+LutDaKkjjqzWlnFU8vh/wC94WHaPKCE5UBj5s+4IOhKnpDpOtHqgzWmlMq1BTxqYBWpzkMUyHLE/wCJgRuX3zzqV1ZDuK7NF/KLeorpJ1BS4p4aTpHtcjKkMCTliSGHBwQMfOnSTLUluP3EunUENPQyXGWjEvjTxO9SyMoJRNqKG7AKAAO2D+ei3XIS/bbM/wBY9YzUsx+4p4FjmhUyRRkHAUkJhlJy21sn1BB00rb2S246aKNhuFesTVlVaI1p1lEVS4A/psrqQACeDjcfTPPqRpWqToWKbdgeLqGK5GOkjgjKQOXeZYzIzBnU8tnPBKsM8AMRyGxqab3ITekkOk6/ax3Sq2QtSwyybzTE71RkJU9iQcFxzxwcn21ZV7XRLddBvp7rApMtLX2xkt0pZTP4PEjFgrFc5DKNnbA5LD3w2ki6FP8AXqxl1usVwrY0ppI456iQLN9yzRjwwCqecHKtsJIBOcce2am72yttaSK1NBNZKqGnlWCqNEq+OYWyGdNxDIisrEDlCCeckkgkDS8Y6SZLqPeyG41M8i1tXNK1dEJKWKohEh8VQ+5ViYHkjO3B5wMZzp+DmrYklLbRWrbHU0U9RR1CrR1lHs8SJEcqmByBgNj8ZJGOdhA+Vlj7sFF3xa2i5HDW2m6rLUVj0vjeItLLBAHNQOSduWXCjLc+hX2AGoim9seuLt6KUtNXWUMTdvGM8s6BJYtkjou0+UenCnuMZXHPOrf5dCuHF6Yi/SO5NQzGlu7HfKZTNJmTzAE54IAHJP6+urP4p/YFjddlr/YKtqKygepun41DxmAsGlKAhSoPBOTnLH31S8/dF3F65GmnLWe2vb6GeI3GRWkj+6cFHIwAhwcDJ9+/zgaSMlN02WOTjFpdkLVbVtNTQTMIUl3B6dYVzCMLvYEYzzvwDnHv21MeEXa2UuTlHegbU2O7ymKl+5SaUozJIh24CncreGCN3Kg+ncn01e86T0R6erbtlO1fTquiZJmuMzMzJIKKEB2EjY3YZccDk49z+eq5+SpaSF9L+Zhe09O3Sghlmp6ueSBclKd14ZmdicnuMe34fXWaU5S6LIQk02HLXYrsFnqEKVDBMS7lCBVI4JxghsnHqO+kak9seONp7ZFNavsbxunNcjSohMZkLLkITyMHGcEEnGONQm7GlipsZZbaV/3pKKFqWpdowsco3BN3LbP+kKT+f76hN9sq4fNF+ahtEEtGKSNx4O9YZWB24LHOPXJyBgju2jne2PwgnXRRpKVHiqzNSCppzGBPChaNWcjse53BQe3GONQt9oEqttHv5TMsNwuZqJ6arkP9MSMG8ikHw2Y8jAyvHoT20vuq2S4pXo0NnpjE1K8lZElEsDCZ0my8gOwj0zkZPIz21eptRT/BcljVJ9CQXBfvQlOZoxEqIiSAuQCTsGfThuR6HPoNVuUm9FTkk/b0NS8COc0dRDBLFE2FqmAXYWwqKoODg7f1yP0TbCOZR1JWStW0EZnrKh/GWF38SJiWICd2XcOeQME59vXluHL5E5xJmro5KOBY5aaes8VY1qXCqs6bdwBByOQTzgY9Ox07hatA8i
ZVF0p6edJKeSDxSx2mqI4Tnedy9xnAye5YdhjUrW0Q5K1QNul2mrKNXjq4UqlfKFGJWNzxgk/4RvPJ5GMd9R7dEuXH9x9NV+DUJFLUQqYZhmJVUxTAY2DGcgYznnnPtqmldsrjN3VhhIFv0HkjxEZfGepghZknK5b1OSQCCWIGCfQasX4LHb0Zqt6dp6nqOJ5JTUNCBUy0u8rEsqICMjOfnAJB2Edzq6OVx6KZNOWxgtCUdpDvUNU0s7rVlYiWUuVHIJ5A8jZHcHVj8iT0kJXyfNH19udsP146er6WRmoza4GaYMJPE2yzjcMfGBhueNe3+kycsLtfP/o3+N+g4Z9Y6V6XrKrJj2QSu0kRC7QVJzwPTknXbj2a2YQ6sFPof6bSrZPorUVEkTP9zHUSEMDgAOFU/nnP7aT+Ys/lOK1im6V8jjJ8xOTqtPbKJPYFlj8E4PfVydjJlo3OaaGOJyXVBgar4Ifk6pjDEZYyyqWwD29NMlRW+zR/Svoap+pHXdo6fpkZlq5h47KQNkK+aRsngYUHGfUgeusnm5142CWT7dfuRLrR+kNmiFtsj26iZKUmmjiqYYYiZCI32sgyMnDMMtgDGMZwM/M5ZG+TXbKJY+Cr9rGW+gaiqYI45xVUklUJnWPa6NKpZipwvoWwzDyqCBwe0ObbSZU4xvTtFi20FOTBOJ3BhladJ6mVVJOWcHkchcc4AzsPzqIummittrTEWrt1z6fQUkKUrwSlpsjxZ5vMQ0pj9FIYY4Awc8+kuSUaQ0pKkqK0lxNTaanwKKaRkVFjkZy/26+JnaGPI5IAx8986nkPHJF2qMcJ7pT2+qgrLdPRV0TytVQ1BKqVEnDKuQCRyxIOMFTyDzKnUmjPJSS2thSngrXuNM1SPGq4acMtVVMT4kGSdxhBwSykgkE44JGew8qpJl6vlUt/n/6LFZZqOzQ/cwyrVUldJG8dC0DEKVZ+FYZO3AGCx5yRjOoUl2iOMYbStMGUXR1NVvVNbqlaWrgK10dLEpMbhQECyKo353kjIP8AiORjV3NtUxZQ4Sav/wBFe4dNVN7mejpWjirZoWysXkRXRHDYIGGGfNuDEkpgnvpVJfKGnFaUfsVZemb1090hK11g+6q6iMbac4ZoMEuXI/Cq7XO7sc4bupwclJ2Vu1DYbounIFjoIqtIzJRA0pq5gSpjYKOB68uAG7ggD14RzkuyUuJahoYEr1uFYYlmSfwn8WCVI+EZtrJGc7Tx5fcdwMk2RkpbfwRBxVuTBV96ZuS1VRHLV0dLTVyRkUyAiKNkHDZbzAjJbJOQD+gZ5F1QdJ7pMqW+0rFTss9bJEsammZ6ZkAdwrAtICCMbWZgex+OQa+e9ohOK22Vz0jUwVtWlzlplnaF6emioCC0khZcb3YEjuCoHPAAPfULJUqJjFX7zVWiO81tE86ypUU1CBEZ6qAIojLhCgLAMi4ODg+UkHGWJI253rolJ8aroo2XpqvL1c0VzId8GSIuSmzJ4CgYBxu5HfAz8zxbHcGlprZHU2SpoqmRqioSnpCSsccgKMVjcMSp77Tx+mdLKPEoabe2eNzdHp6eqrNjT+aCqKhC644I7kKxyoLc8caFFIbnWmy9R3S2TWyQxVMlYGkcpI/lUgKMI3HP4kHH6kaGm9UOpKO47QWN7akFQZad7ZFFkL4kRBA8PJQ4znBOD6e2fWXF29dFinGq6LcVXT0sJq6hmNFMBJHHBDlg+89242j8IORwM6VRGi0k4y0CKeWVBXSGriEhmRQiNlVww/Ao/ESuSADjI7gnUrHoqjJ7bYR+8jlVblRwlxjzF12MUx+IoTz+Eep/EDqXCMfkdzctxRVpLzcbdC1U7NRbl8Yqq4bzMoQs3HcHLDjGCffQo0yFlabvX/PkvXKWooEUoEuslNJiWeNhtJJG9yO7KQV98fmNVSS67JnJrp2/+WCqk19wWnlZB99EpqIqWQFhjLDeP8KMrEDn/pPfUJ0ij3S2uynVXq8Wint0c/2YmiKxyPLKo8QknerE8EgEY9eQPbQnugfNRNK1TLcaSh8STw6MMqicjEZl2lidg82MbPMRjOO+caJbVl0YOVNPX5BX3NeKdBT0sCQyzMwqERn3up/qEFR+I5jwoGd3fAzqKuNGZ8q/BLIl9+0pIoI6qlihLlg4wrYY7sE/hYjb+PHOPQnUJUQ1k/SxzR1rVcciUUkVXVTGGKEt4aKCu5SS3/DKsdufTAz76ZtA4uwNR014t9w+zqHaqp6ibKsF3EAHGO/DbRjGfNkd8cNyi1Qm1pjp7JV3bxKa21aMJ9sdME/pspUZIDE4GA/pknGptJJGiONTbaLsNNLb4H+4rIJlCGCoRAyhV2AMVxnA57dsEYIJA1XbT6I41dgaC62+qvJt1WslR4rpMKt9qxJTugaNix5K72BI74Jzpl1snlBy4ysOQ3GjoqKtlheGlNPEIGZHWREaRWQOOCP6nmOMkBSQTqFFPsOSi2ooLWCpENNQtLUxxVLrIfCL4lMW8kFhuyASGzt4A2YGdQ2ouwu/dJgqO63K59OSRvUTXJIYv6EJV0WTzltiscFSSCcN79zjSW5Kv7EqMpx70jBfW/p2sv308+0tM09FdrksLwUdNucTyDGYmYgeHktj/wCXBA11Ppko+N5Mck1a/wBrGUEpJnx9QW+5UnVi0F0pKmiuUIaCWlqIisqNyNpU4IPOvo8Zqfuj0b4jvqp1L/tNDQRvCFqrZmmMgGCyHsCMf4Sp/wD5aeOmWM5/S0zVlXDTxlQ8jqi7jgZJxyfbVgqPpDrOWKyfSWltFvn8Snp6eNfE/CJPOMtj5O4/qdVFj6OD0FQI2qV3BWI4/vpV0Z5dguuk8aY8fGdWxVDpleMEkAalkktTKJFQKoG0Yzjk/noFR9jfwwdAVHQXSJvtTRU81wviQzhKiD+rFTK7MixPzgvty3ByNo7jXgPrHm+tm9Ffpj/uZczknTWj6QTqG31FPQLWVDtHWSeA4pwR4jK2SpLd48bUzjGR25OuBapkReN/qYRo79SpBVTMI6iseVo6SmwFEYLmNSPZSnt2HfB0l1ciOS2k9lbxaeW+fbwUSyRvC0QiAXb4rAuQGA4JY4I4yQfy017ByUmuX7FSmulr3wRw14oo1/pYeRTKz7ihBXPcEldw4GQM88rfFfgXlBK0Ca69xUNZBDS1U1N4NQROlwwJqdQ22MAABRtJOffd6ehbk6Dko/pdAug60pembtRAyLRRTnEk0Gz7gw4LMqlshQrc4OcB8cjVkbvk/wD7EWVJ02G5qteoaqOWnj8H7pYQbpPIkMEAcjcT5QUXarDb5gSWxgnUcW3rs1Qx+q16b/dvpf8AEV7lTyUdXPEkxuL0KPEpps+dthG0qTwvD9vNkg41CTToplUG920SUXUkUMkdYq+BTTeFG5nUKsS4UY3N5iCQxPIHl0K3tA8vyxt8uUhEc1ekTRmF5YZvEEayEYz+HDAEtu3A+wxgasVvQSyNpJlT717jQkTk25PuWRVBaVVKliUI/EjbTkHJGJFz30/FxVvootVvpgyo6jkr5YqVzT0UT
FZkpQRKhhfdh+cE8gKwA3D99TUUm/gr58k0BJqqsa3yABp68Sosu2RAsvh8l93GDghdh7n5wNTFR0kxU29FG39dVEU9ZW1dNJO1ukWiaJ0dGkVQSQygDJLeGMjuAD+Wt8Za+5Cvv7E0VfT1taY2ma2RVcTVMKrTl0kUu6qEAUjauxwAwxknjHOqePBc3sEiLqTqW61ktBUyyvCI4F+3eJcy7RtRY4wMBlTBOBliM+x1bGEZK2uhpSdprQKg6kqpLMixwstwiZZHjkVlSONnyzqSeQ53AAcnbk99R6cVO5dD+p7aR06MmOzRxg+WeKmaTPJYswDc9xkemoilxTC2kQXKslmqbVE7Ax7VO3aOc7Qc++R31SNP4AF8maS1XGuO37tZZVWVVAKhVQqBjsAWPHbnSxb5Mpi7WxWqpbZSUstOwV2eSlJZQ48LxIG24bIHIGff11bD5J5NUaitkeSutoeR3D1sULbmJyhiLFfy3AH89VXuh70v3CCu089bRSMXp4HimjUnkOxG4lu57DueNUrSNsttr7BeOxUNvqGqaeARTRSBkYMcAqCBxnHYnVsSlRVKXyCK2sniu9FRpKy0ssdVM8QPlLqi7T8YyRx6caSbdsq+UWavFvs9H4CqglmdHUqGDKAAAQcj1Ok+Ea+MVeiFZGpGgSE+Gi1CuEH4cgEA47HgDU223f3Kcy4uNAmluFRJ0fHUGU+OtLvEgwCCYVJP7k6T+aitusaohepcXKgYrG7GOKUl41bztuVjyO5CqD+Q1EtOkK31/wA+TS9eRJa7taKGlURUlZZYJ54gMhnxKc89uQO2NW5Ek4/sWZUo1X2KtLXTzWiljaVgk00iuqeUYCsRjHblR2+fc6XtbLZvpGotlXMOmKmfxXMqxMcsc7s7gdwPDdh3zqY/pY8G3bZV6jY01HS+ETH4sbo4XjKsEJH5ZZj+ukm6qinJpaMp1ZcaihuwFNJ4Aan7RqFxhEIxjtyc5H+mkW1K/gryNpJoOyUMFPXQiKMRrIEjZU4BDFg3Hucd+/f3OrJa/wBC1rdDlt1Lc56CKqp45ozUxQlWUYZGqY0ZT7gqcHPfj2Gox+6aT61/uDSdX/zoA/yO3ia4VApIhLRSQrT+XyxjtgL2xjjt2AHYDTxbblZTJVkdfBeprbTSmxzNCpkaOWoJ/wD+m3v/AP3fHtuOqZajaLEk2Zjp6ulrerOo6ecRzRW2RDSB4lJh7EgHGcHJyDwfXOkxJbRUm3Jr7BC2SPW2CPx2aQTCBpBnAberM2R7Ejt2HYcHVjSUnRfSor9PFq21T3Cd3lraeV6WOdmO7wvFKbD/AMw2qBzn37nVl0gTcoW/g4h9RrZSP/EBe1+2jVaS2UIgCKFEfl9Ma999G34ab+7N8VT0c++sEEQv3RqiJFDyKHKqAX83qfX9ddef6ZFmTo4Tav8A9yp//kNaJCHe/qPRwwdDRiNAoT7ZRg+m0/vpEO+jhCsRcEOecZ/sdQv0lUuhk/cL6Hvq0SPZZsKg1kGRz9xDz/8AXqqb0/2Y0ixLSQyUdxmZAZEmgVW9gwfcP1wNKpOl+zIT0fav0uq5bv1BTU9U2+np/DihiUBFjTew2qFwAMe3bn31828mK5dfJhUm5bD3Vt5rbf0uJaepeOT/AHtd2c8JN5Rz7bj/AG9hrn8U5JMrcmmi3WVMorLQodgrxSEqDgZAcjA9MH20JKhLdo0V4LQdA9OTxO8Uwngh8SNyrbZHIfJHckevf50vyzRFKUVf4K0FtpqyGVZ4vFAihrV3knbNJtV3X2yD6cZ57860S6KeKfZmeqK6apmqhK4f7aBxESoyoJ5GfnA76r4pWTlSVJEt3kFd1Tbqeojhlp1EkIhaFSm0RRgArjBxnv3/AG1fHpv7f+h+8iv/AJoqV5JnraFiXpYKl0jjc7go3n3/ACBz3zz31kk3yQj/AEJjup6+otHWfUX2spQvAszF/Pl3L7mO7PPYj2IBGCBrTxT7XZTLTdHhcamvmv8AFUTNIlPbp3jzwQVeoVeRycBVAz7aIpKaS6/+hU7exnWk0lntUH2jtEC6uQWLAnCHJBz6sf31biWkvyPLVotQTyNEFLsRKZ3fJ/ESxXn9P/ONO4qmKtxojvMrW2wyU9MfBjp6pfDKjzgF1BG7uQQzcE45z31GJJt2Wv29Fe0VU1S4qJJGad2pEaTOCQ1SgYZ+QNUQ/XXx/wDpEFfJsm6NxUXqISgS/wC6Qy5cbiWeVUcknvlQBj4Htplp0hcbfQL6oLJCjB3ylX4S5YkKqNMFAHpgHHGrl+toj4o0NRxDbKXvAq0+1W5I8pHB75x66y49yf8AUef6qKkltpoXDRx+G009RHIyEglROQFyOw+Nbsi/6jXwWwS5L+p//9k=", @@ -674,12 +692,12 @@ { "data": { "text/plain": [ - "[{'score': 0.4759259521961212, 'label': 'flat-coated retriever'},\n", - " {'score': 0.10909564793109894, 'label': 'Labrador retriever'},\n", - " {'score': 0.08196048438549042, 'label': 'Great Dane'}]" + "[{'score': 0.4759257435798645, 'label': 'flat-coated retriever'},\n", + " {'score': 0.10909581184387207, 'label': 'Labrador retriever'},\n", + " {'score': 0.0819605216383934, 'label': 'Great Dane'}]" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -716,33 +734,33 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "id": "1cd9423b-3cba-4cf9-b58a-bffbde08738d", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:48.888614Z", - "iopub.status.busy": "2023-03-12T19:47:48.888417Z", - "iopub.status.idle": "2023-03-12T19:47:52.868441Z", - "shell.execute_reply": "2023-03-12T19:47:52.867833Z", - "shell.execute_reply.started": "2023-03-12T19:47:48.888595Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { 
"data": { "text/plain": [ - "[{'score': 0.0960492491722107,\n", + "[{'score': 0.09604837000370026,\n", " 'token': 4827,\n", " 'token_str': 'fashion',\n", " 'sequence': 'i am a fashion model'},\n", - " {'score': 0.09326528012752533,\n", + " {'score': 0.09326566755771637,\n", " 'token': 2535,\n", " 'token_str': 'role',\n", " 'sequence': 'i am a role model'}]" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -789,29 +807,29 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "id": "5d9cedee-902c-41b2-b6ef-1c04bbd8498e", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:52.869399Z", - "iopub.status.busy": "2023-03-12T19:47:52.869151Z", - "iopub.status.idle": "2023-03-12T19:47:53.395475Z", - "shell.execute_reply": "2023-03-12T19:47:53.394997Z", - "shell.execute_reply.started": "2023-03-12T19:47:52.869376Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ - "{'score': 0.5305245518684387,\n", + "{'score': 0.5305243730545044,\n", " 'start': 12,\n", " 'end': 75,\n", " 'answer': 'an open source toolkit for deep learning inference optimization'}" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -845,26 +863,28 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "755096b0-406d-4167-8642-5b4b093e2bc5", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:47:53.396122Z", - "iopub.status.busy": "2023-03-12T19:47:53.395987Z", - "iopub.status.idle": "2023-03-12T19:48:00.052492Z", - "shell.execute_reply": "2023-03-12T19:48:00.052125Z", - "shell.execute_reply.started": "2023-03-12T19:47:53.396109Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the encoder to CPU ...\n", + "Compiling the decoder to CPU ...\n", + "Compiling the decoder to CPU ...\n" + ] + }, { "data": { "text/plain": [ "'Das Haus ist wunderbar.'" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -884,16 +904,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "c0f096f2-0cd3-40ab-9661-1c7209c6a670", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:00.053145Z", - "iopub.status.busy": "2023-03-12T19:48:00.052965Z", - "iopub.status.idle": "2023-03-12T19:48:00.056985Z", - "shell.execute_reply": "2023-03-12T19:48:00.056530Z", - "shell.execute_reply.started": "2023-03-12T19:48:00.053133Z" - }, "tags": [] }, "outputs": [ @@ -906,7 +919,7 @@ " 'translation_en_to_ro']" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -918,16 +931,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "ebf97e9c-d1e1-4908-8a3a-bad24ee3a34f", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:00.057929Z", - "iopub.status.busy": "2023-03-12T19:48:00.057694Z", - "iopub.status.idle": "2023-03-12T19:48:00.399891Z", - "shell.execute_reply": "2023-03-12T19:48:00.399417Z", - "shell.execute_reply.started": "2023-03-12T19:48:00.057914Z" - }, "tags": [] }, "outputs": [ @@ -937,7 +943,7 @@ "[{'translation_text': \"Qu'est-ce qu'un modèle de séquence à séquence?\"}]" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": 
"execute_result" } @@ -967,26 +973,26 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "id": "59df852f-e9bb-4f96-91d9-23f4e4577ea0", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:00.400753Z", - "iopub.status.busy": "2023-03-12T19:48:00.400401Z", - "iopub.status.idle": "2023-03-12T19:48:04.459054Z", - "shell.execute_reply": "2023-03-12T19:48:04.458437Z", - "shell.execute_reply.started": "2023-03-12T19:48:00.400729Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "data": { "text/plain": [ "[{'label': 'nl', 'score': 0.994126558303833}]" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1018,19 +1024,19 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "id": "87f633c0-3334-4ef0-942a-5bdec807568d", "metadata": { - "execution": { - "iopub.execute_input": "2023-03-12T19:48:04.459956Z", - "iopub.status.busy": "2023-03-12T19:48:04.459670Z", - "iopub.status.idle": "2023-03-12T19:48:07.863986Z", - "shell.execute_reply": "2023-03-12T19:48:07.863615Z", - "shell.execute_reply.started": "2023-03-12T19:48:04.459935Z" - }, "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the model to CPU ...\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -1070,6 +1076,14 @@ "- Notebook: [Post Training Quantization of a question-answering model](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb)\n", "- Examples: [Quantization Aware Training examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0606fb02-2367-4573-8461-7fc8c065ece6", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1088,7 +1102,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/notebooks/openvino/question_answering_quantization.ipynb b/notebooks/openvino/question_answering_quantization.ipynb index 782a48ff45..ba4a84ca38 100644 --- a/notebooks/openvino/question_answering_quantization.ipynb +++ b/notebooks/openvino/question_answering_quantization.ipynb @@ -846,9 +846,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "FP32 model size: 436.07 MB\n", - "INT8 model size: 182.41 MB\n", - "INT8 size decrease: 2.39x\n" + "FP32 model size: 436.50 MB\n", + "INT8 model size: 181.84 MB\n", + "INT8 size decrease: 2.4x\n" ] } ], @@ -858,7 +858,7 @@ " Return OpenVINO or PyTorch model size in Mb.\n", " Arguments:\n", " model_folder:\n", - " Directory containing a pytorch_model.bin for a PyTorch model, and an openvino_model.xml/.bin for an OpenVINO model.\n", + " Directory containing a model.safetensors for a PyTorch model, and an openvino_model.xml/.bin for an OpenVINO model.\n", " framework:\n", " Define whether the model is a PyTorch or an OpenVINO model.\n", " \"\"\"\n", @@ -866,7 +866,7 @@ " model_path = Path(model_folder) / \"openvino_model.xml\"\n", " model_size = model_path.stat().st_size + model_path.with_suffix(\".bin\").stat().st_size\n", " elif framework.lower() == \"pytorch\":\n", - " model_path = Path(model_folder) / \"pytorch_model.bin\"\n", + " model_path = Path(model_folder) / \"model.safetensors\"\n", " 
model_size = model_path.stat().st_size\n", " model_size /= 1000 * 1000\n", " return model_size\n", From 2b186f2703b3f3102c05abe2123f004094d5262e Mon Sep 17 00:00:00 2001 From: Ofir Zafrir Date: Fri, 15 Mar 2024 22:11:02 +0200 Subject: [PATCH 28/30] Add Phi-2 on Intel's MTL iGPU demo notebook (#606) * Add phi-2 notebook * notebook update * remove stateful setting use default instead --- notebooks/openvino/phi-2_on_mtl.ipynb | 583 ++++++++++++++++++++++++++ 1 file changed, 583 insertions(+) create mode 100644 notebooks/openvino/phi-2_on_mtl.ipynb diff --git a/notebooks/openvino/phi-2_on_mtl.ipynb b/notebooks/openvino/phi-2_on_mtl.ipynb new file mode 100644 index 0000000000..88f0387f05 --- /dev/null +++ b/notebooks/openvino/phi-2_on_mtl.ipynb @@ -0,0 +1,583 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "aeb16663-be53-4260-b62d-44611b6771ec", + "metadata": {}, + "source": [ + "# Chat and Code with Phi-2 with OpenVINO and 🤗 Optimum on Intel Meteor Lake iGPU\n", + "In this notebook we will show how to export and apply weight only quantization on Phi-2 to 4 bits.\n", + "Then using the quantized model we will show how to generate code completions with the model running on Intel Meteor Lake iGPU presenting a good experience of running GenAI locally on Intel PC marking the start of the AIPC Era!\n", + "Then we will show how to talk with Phi-2 in a ChatBot demo running completely locally on your Laptop!\n", + "\n", + "[Phi-2](https://huggingface.co/microsoft/phi-2) is a 2.7 billion-parameter language model trained by Microsoft. Microsoft in the model's release [blog post](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) states that Phi-2:\n", + "> demonstrates outstanding reasoning and language understanding capabilities, showcasing state-of-the-art performance among base language models with less than 13 billion parameters. On complex benchmarks Phi-2 matches or outperforms models up to 25x larger, thanks to new innovations in model scaling and training data curation." + ] + }, + { + "cell_type": "markdown", + "id": "03cb49cf-bc6f-4702-a61f-227b352404cb", + "metadata": {}, + "source": [ + "## Install dependencies\n", + "Make sure you have the latest GPU drivers installed on your machine: https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html.\n", + "\n", + "We will start by installing the dependencies, that can be done by uncommenting the following cell and run it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96d8203c-34c9-41a2-95bd-3891533840a1", + "metadata": {}, + "outputs": [], + "source": [ + "# ! pip install optimum[openvino,nncf] torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5980ce40-0be1-48c1-941a-92c484d4da31", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from transformers import AutoTokenizer\n", + "from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig" + ] + }, + { + "cell_type": "markdown", + "id": "48b81857-a095-43a3-8c8d-4c880b743a6e", + "metadata": {}, + "source": [ + "## Configuration\n", + "Here we will configure which model to load and other attributes. 
We will explain everything 😄\n", + "* `model_name`: the name or path of the model we want to export and quantize, can be either on the 🤗 Hub or a local directory on your laptop.\n", + "* `save_name`: directory where the exported & quantized model will be saved.\n", + "* `precision`: the compute data type we will use for inference of the model, can be either `f32` or `f16`. We use FP32 precision due to Phi-2 overflow issues in FP16.\n", + "* `quantization_config`: here we set the attributes for the weight only quantization algorithm:\n", + " * `bits`: number of bits to use for quantization, can be either `8` or `4`.\n", + " * `sym`: whether to use symmetric quantization or not, can be either `True` or `False`.\n", + " * `group_size`: number of weights to group together for quantization. We use groups of 128 to ensure no accuracy degradation.\n", + " * `ratio`: the ratio of the model to quantize to #`bits`. The rest will be quantize to the default bits number, `8`.\n", + "* `device`: the device to use for inference, can be either `cpu` or `gpu`.\n", + "* `stateful`: Optimize model by setting the KV cache as part of the models state instead of as an input\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800cd7a3-a21d-4a0a-9d73-2a2d08646f99", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = 'microsoft/phi-2'\n", + "save_name = './phi-2-woq4'\n", + "precision = 'f32'\n", + "quantization_config = OVWeightQuantizationConfig(\n", + " bits=4,\n", + " sym=False,\n", + " group_size=128,\n", + " ratio=0.8,\n", + ")\n", + "device = 'gpu'" + ] + }, + { + "cell_type": "markdown", + "id": "1f398868-93d7-4c2d-9591-9bac8e9b701c", + "metadata": {}, + "source": [ + "With this configuration we expect the model size to reduce to around to 1.62GB: $0.8 \\times 2.7{\\times}10^3 \\times \\frac{1}{2}\\text{B} + 0.2 * 2.7{\\times}10^3 \\times 1\\text{B} = 1.62{\\times}10^3\\text{B} = 1.62\\text{GB}$" + ] + }, + { + "cell_type": "markdown", + "id": "d994997d-344c-4d6c-ab08-f78ecb7f56ec", + "metadata": {}, + "source": [ + "## Export & quantize\n", + "OpenVINO together with 🤗 Optimum enables you to load, export and quantize a model in a single `from_pretrained` call making the process as simple as possible.\n", + "Then, we will save the exported & quantized model locally on our laptop. If the model was already exported and saved before we will load the locally saved model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a308c6-27e7-4926-8ac4-4fa0c1ca68d2", + "metadata": {}, + "outputs": [], + "source": [ + "# Load kwargs\n", + "load_kwargs = {\n", + " 'device': device,\n", + " 'ov_config': {\n", + " \"PERFORMANCE_HINT\": \"LATENCY\",\n", + " \"INFERENCE_PRECISION_HINT\": precision,\n", + " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", + " },\n", + " 'compile': False,\n", + " 'quantization_config': quantization_config\n", + "}\n", + "\n", + "# Check whether the model was already exported\n", + "saved = os.path.exists(save_name)\n", + "\n", + "model = OVModelForCausalLM.from_pretrained(\n", + " model_name if not saved else save_name,\n", + " export=not saved,\n", + " **load_kwargs,\n", + ")\n", + "\n", + "# Load tokenizer to be used with the model\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name if not saved else save_name)\n", + "\n", + "# Save the exported model locally\n", + "if not saved:\n", + " model.save_pretrained(save_name)\n", + " tokenizer.save_pretrained(save_name)\n", + "\n", + "# TODO Optional: export to huggingface/hub\n", + "\n", + "model_size = os.stat(os.path.join(save_name, 'openvino_model.bin')).st_size / 1024 ** 3\n", + "print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')" + ] + }, + { + "cell_type": "markdown", + "id": "592e118d-e8bb-491f-92b2-d0418e19158c", + "metadata": {}, + "source": [ + "We can see the model size was reduced to 1.7GB as expected. After loading the model we can switch the model between devices using `model.to('gpu')` for example.\n", + "After we have finished to configure everything, we can compile the model by calling `model.compile()` and the model will be ready for usage." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cef4dc0-191e-4755-a639-c3e8adbd18a2", + "metadata": {}, + "outputs": [], + "source": [ + "model.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "dd3c467e-3bbb-4265-9075-1c6688af2f92", + "metadata": {}, + "source": [ + "## Generate using the exported model\n", + "We will now show an example where we will use our quantized Phi-2 to generate code in Python. \n", + "Phi-2 knows how to do code completions where the model is given a function's signature and its docstring and the model will generate the implementation of the function.\n", + "\n", + "In our example we have taken one of the samples from the test set of HumanEval dataset. \n", + "HumanEval is a code completion dataset used to train and benchmark models on code completion in Python. 
\n", + "Phi-2 has scored a remarkable result on the HumanEval dataset and is an excellent model to use for code completions.\n", + "\n", + "Note: the first time you run the model might take more time due to loading and compilation overheads of the first inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b4ea738-7db5-490e-9338-d6420b77796c", + "metadata": {}, + "outputs": [], + "source": [ + "sample = \"\"\"from typing import List\n", + "\n", + "\n", + "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n", + " \\\"\\\"\\\" Check if in given list of numbers, are any two numbers closer to each other than\n", + " given threshold.\n", + " >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n", + " False\n", + " >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n", + " True\n", + " \\\"\\\"\\\"\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14ffe7f9-7d93-4a49-95d8-5f2a4e400cfe", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TextStreamer\n", + "\n", + "# Tokenize the sample\n", + "inputs = tokenizer([sample], return_tensors='pt')\n", + "\n", + "# Call generate on the inputs\n", + "out = model.generate(\n", + " **inputs,\n", + " max_new_tokens=128,\n", + " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", + " pad_token_id=tokenizer.eos_token_id,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3f8aa25c-de59-4e79-9a1f-c03ec76d206a", + "metadata": {}, + "source": [ + "## Chatbot demo\n", + "We will continue to build a chatbot demo running with Gradio using the model we just exported and quantized.\n", + "The chatbot will be rather simple where the user will input a message and the model will reply to the user by generating text using the entire chat history as the input to the model.\n", + "\n", + "A lot of models that were trained for the chatbot use case have been trained with special tokens to tell the model who is the current speaker and with a special system message. \n", + "Phi-2 wasn't trained specifically for the chatbot use case and doesn't have any special tokens either, however, it has seen chats in the training data and therefore is suited for that use case.\n", + "\n", + "The chat template we will use is rather simple:\n", + "```\n", + "User: \n", + "Assistant: \n", + "User: \n", + "...\n", + "```\n", + "\n", + "We will start by writing the core function of the chatbot that receives the entire history of the chat and generates the assistant's response.\n", + "To support this core function we will build a few assistant functions to prepare the input for the model and to stop generation in time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e81d125-ff47-4122-853d-11a2763db146", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from threading import Thread\n", + "\n", + "from transformers import (\n", + " TextIteratorStreamer,\n", + " StoppingCriteria,\n", + " StoppingCriteriaList,\n", + " GenerationConfig,\n", + ")\n", + "\n", + "\n", + "# Copied and modified from https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/generation.py#L13\n", + "class SuffixCriteria(StoppingCriteria):\n", + " def __init__(self, start_length, eof_strings, tokenizer, check_fn=None):\n", + " self.start_length = start_length\n", + " self.eof_strings = eof_strings\n", + " self.tokenizer = tokenizer\n", + " if check_fn is None:\n", + " check_fn = lambda decoded_generation: any(\n", + " [decoded_generation.endswith(stop_string) for stop_string in self.eof_strings]\n", + " )\n", + " self.check_fn = check_fn\n", + "\n", + " def __call__(self, input_ids, scores, **kwargs):\n", + " \"\"\"Returns True if generated sequence ends with any of the stop strings\"\"\"\n", + " decoded_generations = self.tokenizer.batch_decode(input_ids[:, self.start_length :])\n", + " return all([self.check_fn(decoded_generation) for decoded_generation in decoded_generations])\n", + "\n", + "\n", + "def is_partial_stop(output, stop_str):\n", + " \"\"\"Check whether the output contains a partial stop str.\"\"\"\n", + " for i in range(0, min(len(output), len(stop_str))):\n", + " if stop_str.startswith(output[-i:]):\n", + " return True\n", + " return False\n", + "\n", + "\n", + "\n", + "# Set the chat template to the tokenizer. The chat template implements the simple template of\n", + "# User: content\n", + "# Assistant: content\n", + "# ...\n", + "# Read more about chat templates here https://huggingface.co/docs/transformers/main/en/chat_templating\n", + "tokenizer.chat_template = \"{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}\"\n", + "\n", + "\n", + "def prepare_history_for_model(history):\n", + " \"\"\"\n", + " Converts the history to a tokenized prompt in the format expected by the model.\n", + " Params:\n", + " history: dialogue history\n", + " Returns:\n", + " Tokenized prompt\n", + " \"\"\"\n", + " messages = []\n", + " for idx, (user_msg, model_msg) in enumerate(history):\n", + " # skip the last assistant message if its empty, the tokenizer will do the formating\n", + " if idx == len(history) - 1 and not model_msg:\n", + " messages.append({'role': 'User', 'content': user_msg})\n", + " break\n", + " if user_msg:\n", + " messages.append({'role': 'User', 'content': user_msg})\n", + " if model_msg:\n", + " messages.append({'role': 'Assistant', 'content': model_msg})\n", + " input_token = tokenizer.apply_chat_template(\n", + " messages,\n", + " add_generation_prompt=True,\n", + " tokenize=True,\n", + " return_tensors=\"pt\",\n", + " return_dict=True\n", + " )\n", + " return input_token\n", + "\n", + "\n", + "def generate(history, temperature, max_new_tokens, top_p, repetition_penalty):\n", + " \"\"\"\n", + " Generates the assistant's reponse given the chatbot history and generation parameters\n", + "\n", + " Params:\n", + " history: conversation history formated in pairs of user and assistant messages `[user_message, assistant_message]`\n", + " temperature: parameter for control the level of creativity in AI-generated text.\n", + " By adjusting the 
`temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse.\n", + " max_new_tokens: The maximum number of tokens we allow the model to generate as a response.\n", + " top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.\n", + " repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.\n", + " Yields:\n", + " Updated history and generation status.\n", + " \"\"\"\n", + " start = time.perf_counter()\n", + " # Construct the input message string for the model by concatenating the current system message and conversation history\n", + " # Tokenize the messages string\n", + " inputs = prepare_history_for_model(history)\n", + " input_length = inputs['input_ids'].shape[1]\n", + " # truncate input in case it is too long.\n", + " # TODO improve this\n", + " if input_length > 2000:\n", + " history = [history[-1]]\n", + " inputs = prepare_history_for_model(history)\n", + " input_length = inputs['input_ids'].shape[1]\n", + "\n", + " prompt_char = '▌'\n", + " history[-1][1] = prompt_char\n", + " yield (history, 'Status: Generating...')\n", + " \n", + " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", + "\n", + " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", + " stop_str = f'\\nUser:'\n", + " stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, [stop_str], tokenizer)])\n", + " # Prepare input for generate\n", + " generation_config = GenerationConfig(\n", + " max_new_tokens=max_new_tokens,\n", + " do_sample=temperature > 0.0,\n", + " temperature=temperature if temperature > 0.0 else 1.0,\n", + " repetition_penalty=repetition_penalty,\n", + " top_p=top_p,\n", + " eos_token_id=[tokenizer.eos_token_id],\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " )\n", + " generate_kwargs = dict(\n", + " streamer=streamer,\n", + " generation_config=generation_config,\n", + " stopping_criteria=stopping_criteria,\n", + " ) | inputs\n", + "\n", + " t1 = Thread(target=model.generate, kwargs=generate_kwargs)\n", + " t1.start()\n", + "\n", + " # Initialize an empty string to store the generated text.\n", + " partial_text = \"\"\n", + " for new_text in streamer:\n", + " partial_text += new_text\n", + " history[-1][1] = partial_text + prompt_char\n", + " # We don't yield the generated text until we are sure it is not the stop string\n", + " pos = partial_text.rfind(stop_str)\n", + " if pos != -1:\n", + " partial_text = partial_text[:pos]\n", + " break\n", + " elif is_partial_stop(partial_text, stop_str):\n", + " continue\n", + " yield (history, 'Status: Generating...')\n", + " history[-1][1] = partial_text\n", + " generation_time = time.perf_counter() - start\n", + " yield (history, f'Generation time: {generation_time:.2f} sec')" + ] + }, + { + "cell_type": "markdown", + "id": "29fe1ae5-9929-4789-9293-612b2062e2a8", + "metadata": {}, + "source": [ + "Next we will create the actual demo using Gradio. The layout will be very simple, a chatbot window followed by a text prompt and some controls.\n", + "We will also include sliders to adjust generation parameters like temperature and length of response we allow the model to generate.\n", + "\n", + "To install Gradio dependency, please uncomment the following cell and run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b61a9a9f", + "metadata": {}, + "outputs": [], + "source": [ + "# ! 
pip install gradio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ae1aa4e-3539-49a1-8f32-62b818ee1002", + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "\n", + "\n", + "EXAMPLES = [\n", + " [\"What is OpenVINO?\"],\n", + " [\"Can you explain to me briefly what is Python programming language?\"],\n", + " [\"Explain the plot of Cinderella in a sentence.\"],\n", + " [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n", + " [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n", + "]\n", + "\n", + "\n", + "def add_user_text(message, history):\n", + " \"\"\"\n", + " Add user's message to chatbot history\n", + "\n", + " Params:\n", + " message: current user message\n", + " history: conversation history\n", + " Returns:\n", + " Updated history, clears user message and status\n", + " \"\"\"\n", + " # Append current user message to history with a blank assistant message which will be generated by the model\n", + " history.append([message, None])\n", + " return ('', history)\n", + "\n", + "\n", + "with gr.Blocks(theme=gr.themes.Soft()) as demo:\n", + " gr.Markdown('
Chat with Phi-2 on Meteor Lake iGPU
')\n", + " chatbot = gr.Chatbot()\n", + " with gr.Row():\n", + " msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n", + " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=25)\n", + " with gr.Row():\n", + " submit = gr.Button(\"Submit\", variant='primary')\n", + " clear = gr.Button(\"Clear\")\n", + " with gr.Accordion(\"Advanced Options:\", open=False):\n", + " with gr.Row():\n", + " with gr.Column():\n", + " temperature = gr.Slider(\n", + " label=\"Temperature\",\n", + " value=0.0,\n", + " minimum=0.0,\n", + " maximum=1.0,\n", + " step=0.05,\n", + " interactive=True,\n", + " )\n", + " max_new_tokens = gr.Slider(\n", + " label=\"Max new tokens\",\n", + " value=128,\n", + " minimum=0,\n", + " maximum=512,\n", + " step=32,\n", + " interactive=True,\n", + " )\n", + " with gr.Column():\n", + " top_p = gr.Slider(\n", + " label=\"Top-p (nucleus sampling)\",\n", + " value=1.0,\n", + " minimum=0.0,\n", + " maximum=1.0,\n", + " step=0.05,\n", + " interactive=True,\n", + " )\n", + " repetition_penalty = gr.Slider(\n", + " label=\"Repetition penalty\",\n", + " value=1.0,\n", + " minimum=1.0,\n", + " maximum=2.0,\n", + " step=0.1,\n", + " interactive=True,\n", + " )\n", + " gr.Examples(\n", + " EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n", + " )\n", + "\n", + " # Sets generate function to be triggered when the user submit a new message\n", + " gr.on(\n", + " triggers=[submit.click, msg.submit],\n", + " fn=add_user_text,\n", + " inputs=[msg, chatbot],\n", + " outputs=[msg, chatbot],\n", + " queue=False,\n", + " ).then(\n", + " fn=generate,\n", + " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty],\n", + " outputs=[chatbot, status],\n", + " concurrency_limit=1,\n", + " queue=True\n", + " )\n", + " \n", + " clear.click(fn=lambda: (None, 'Status: Idle'), inputs=None, outputs=[chatbot, status], queue=False)" + ] + }, + { + "cell_type": "markdown", + "id": "1d1baf09-26f1-40ab-896c-3468b5e89fec", + "metadata": {}, + "source": [ + "That's it, all that is left is to start the demo!\n", + "\n", + "When you're done you can use `demo.close()` to close the demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b73962d-f977-45b7-be3a-32b65e546737", + "metadata": {}, + "outputs": [], + "source": [ + "demo.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e26a0bc-6a78-4185-8b0c-7e9450ba5868", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# demo.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 428c7789dfebbd7101642e780815b0bc91f6122e Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 18 Mar 2024 13:46:55 +0400 Subject: [PATCH 29/30] Revert "Apply bettertransformer by default for causallm (#605)" (#612) This reverts commit 948f99d20a93a066351f7ee6a63b204fc48fc4b4. 
--- optimum/exporters/openvino/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 8c49994874..dfca80f001 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -293,7 +293,7 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") output = Path(output) - if ensure_export_task_support_stateful(config.task): + if stateful: # Trigger bettertransformer together with stateful model because OpenVINO HW-dependent transformations expect # both of them are applied to demonstrate the best performance. # TODO: Consider applying bettertransformer regardless of stateful flag -- requires additional validation. From e036ce6ce52e37666489e4713debbf2cc527963f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 18 Mar 2024 15:26:44 +0100 Subject: [PATCH 30/30] Add eval method for OVModels (#591) --- optimum/intel/openvino/modeling_base.py | 3 +++ tests/openvino/test_modeling.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 15f1fc4f1c..58d1ce6c73 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -448,6 +448,9 @@ def half(self): self.request = None return self + def eval(self): + return self + def forward(self, *args, **kwargs): raise NotImplementedError diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 9df6c73214..153fafdf2b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -272,6 +272,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForSequenceClassification.from_pretrained(model_id, export=True, compile=False) + model.eval() tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-classification", model=model, tokenizer=tokenizer) text = "This restaurant is awesome" @@ -345,6 +346,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForQuestionAnswering.from_pretrained(model_id, export=True) + model.eval() tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) question = "What's my name?" 
@@ -411,6 +413,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForTokenClassification.from_pretrained(model_id, export=True) + model.eval() tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("token-classification", model=model, tokenizer=tokenizer) outputs = pipe("My Name is Arthur and I live in Lyon.") @@ -460,6 +463,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForFeatureExtraction.from_pretrained(model_id, export=True) + model.eval() tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("feature-extraction", model=model, tokenizer=tokenizer) outputs = pipe("My Name is Arthur and I live in Lyon.") @@ -568,6 +572,7 @@ def test_pipeline(self, model_arch): model = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=False, compile=False, **model_kwargs ) + model.eval() model.config.encoder_no_repeat_ngram_size = 0 model.to("cpu") model.half() @@ -758,6 +763,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForMaskedLM.from_pretrained(model_id, export=True) + model.eval() tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer) outputs = pipe(f"This is a {tokenizer.mask_token}.") @@ -815,6 +821,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, export=True) + model.eval() preprocessor = AutoFeatureExtractor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) outputs = pipe("http://images.cocodataset.org/val2017/000000039769.jpg") @@ -911,6 +918,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, compile=False) + model.eval() model.half() model.to("cpu") model.compile() @@ -1044,6 +1052,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForAudioClassification.from_pretrained(model_id, export=True) + model.eval() preprocessor = AutoFeatureExtractor.from_pretrained(model_id) pipe = pipeline("audio-classification", model=model, feature_extractor=preprocessor) outputs = pipe([np.random.random(16000)]) @@ -1354,6 +1363,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) + model.eval() processor = get_preprocessor(model_id) GenerationConfig.from_pretrained(model_id) pipe = pipeline(