From a48e0ca3177075235c2a67ee770e07bd751e49dd Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 2 Apr 2024 18:43:31 +0400 Subject: [PATCH 01/25] Rework inputs preparation for OVModelForCausalLM (#620) * refactor OVModelForCausalLM class * rework prepare_inputs_for_generation for OVModelForCausalLM * refactoring * Apply suggestions from code review * fix position ids and add tests --- optimum/intel/openvino/modeling_decoder.py | 96 +++++++++------------- tests/openvino/test_modeling.py | 5 ++ 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 10f0359a24..4b156eda9e 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -120,6 +120,7 @@ def __init__( self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 self.next_beam_idx = None + self._past_length = 0 self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -356,19 +357,14 @@ def prepare_inputs( position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> Dict: - if self.use_cache and past_key_values is not None: - input_ids = input_ids[:, -1:] - batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads inputs = {} - past_len = 0 if not self.stateful: if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: - past_len = past_key_values[0][1].shape[-2] if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -381,8 +377,6 @@ def prepare_inputs( past_key_values = tuple( past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer ) - else: - past_len = past_key_values[0].shape[-2] # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) @@ -411,6 +405,8 @@ def prepare_inputs( # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int) + self._past_length = 0 + past_len = self._get_past_length(past_key_values) inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -432,7 +428,7 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_key_values: - position_ids = np.expand_dims(position_ids[:, -1], axis=-1) + position_ids = position_ids[:, -input_ids.shape[1] :] inputs["position_ids"] = position_ids @@ -470,6 +466,7 @@ def forward( # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. 
past_key_values = ((),) + self._past_length += input_ids.shape[1] if not self.stateful: if self.use_cache: @@ -485,19 +482,32 @@ def forward( return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation + # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) + if past_key_values is not None: + past_len = self._get_past_length(past_key_values) + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_len < input_ids.shape[1]: + input_ids = input_ids[:, past_len:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: + if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) + position_ids = position_ids[:, -input_ids.shape[1] :] return { "input_ids": input_ids, @@ -507,6 +517,24 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "attention_mask": attention_mask, } + def _get_past_length(self, past_key_values=None): + if past_key_values is None: + return 0 + if self.stateful: + return self._past_length + if self.config.model_type in MULTI_QUERY_ATTN_MODELS: + return past_key_values[0].shape[-2] + seq_length_dim = -2 + if self.config.model_type == "chatglm": + seq_length_dim = 0 + elif self.config.model_type == "qwen": + seq_length_dim = 1 + # input is tuple of pairs + if isinstance(past_key_values[0], (tuple, list)): + return past_key_values[0][1].shape[seq_length_dim] + # past key values comes after flattening + return past_key_values[1].shape[seq_length_dim] + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor @@ -573,10 +601,6 @@ def _from_pretrained( model_type = config.model_type.replace("_", "-") if model_type == "bloom": init_cls = OVBloomForCausalLM - elif model_type == "mpt": - init_cls = OVMPTForCausalLM - elif model_type == "opt": - init_cls = OVOPTForCausalLM elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM else: @@ -630,22 +654,12 @@ def _from_pretrained( class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, 
input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - # only last token for input_ids if past is not None if past_key_values and not self.stateful: # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } + return super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, **kwargs) # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache def _reorder_cache( @@ -712,36 +726,6 @@ def _convert_to_standard_cache( ) -class OVOPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - -class OVMPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache def _reorder_cache( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 65094ae221..f54305113f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -632,6 +632,11 @@ def test_multiple_inputs(self, model_arch): outputs = model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) self.assertEqual(outputs.shape[0], 3) + # test that generation result is reproducible + outputs2 = model.generate(**tokens, generation_config=generation_config) + self.assertIsInstance(outputs2, torch.Tensor) + self.assertEqual(outputs2.shape[0], 3) + self.assertTrue(torch.allclose(outputs2, outputs)) del model gc.collect() From 7c1d38b8165c7bd786eaff427ed73931349b0d18 Mon Sep 17 00:00:00 2001 From: Ofir Zafrir Date: Thu, 4 Apr 2024 23:01:48 +0300 Subject: [PATCH 02/25] Add speculative decoding example to OpenVINO quantized generation notebook (#635) * Add assisted generation example to notebook * Bug fixes * Add acceptance rate measurement * Fix explanation of AR formula * Fix norm AR calc to ignore last window * Minor fixes --- .../openvino/quantized_generation_demo.ipynb | 406 +++++++++++++++++- 1 file changed, 389 insertions(+), 17 deletions(-) diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 582b463346..7671064088 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ 
b/notebooks/openvino/quantized_generation_demo.ipynb @@ -223,14 +223,346 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "5179c53d-0436-4ee9-9367-2625a8d3e262", + "metadata": {}, + "source": [ + "## Assisted generation\n", + "Auto-regressive language models generate outputs token by token. Assisted generation (AG) is a general name for a group of methods that speculate the next generated tokens and then use the language model to validate the speculated tokens and accept/reject them.\n", + "AG is a great method to accelerate LMs running locally on your computer as it reduces memory bandwidth requirements and can speedup generation by 1.5x-3x without any accuracy degradation.\n", + "You can read more on assisted generation here in this great [blog post](https://huggingface.co/blog/assisted-generation).\n", + "\n", + "\n", + "In this section we will present how to run Phi-2 with two AG methods that are well supported within 🤗 transformers: Prompt Lookahead Decoding (PLD) and Speculative Decoding.\n", + "\n", + "To use Phi-2 with AG we will need to export the model again with `stateful=False` as OpenVINO stateful models don't support speculative decoding yet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc4484f-8234-4206-9f28-7a02a7444e25", + "metadata": {}, + "outputs": [], + "source": [ + "# Save the model in a different directory to set it apart from the stateful model\n", + "save_name = model_name.split(\"/\")[-1] + \"_openvino_stateless\"\n", + "\n", + "load_kwargs[\"ov_config\"][\"CACHE_DIR\"] = os.path.join(save_name, \"model_cache\")\n", + "\n", + "# Check whether the model was already exported\n", + "saved = os.path.exists(save_name)\n", + "\n", + "# We can use the same loading attributes, the only differece is the stateful attribute\n", + "stateless_model = OVModelForCausalLM.from_pretrained(\n", + " model_name if not saved else save_name,\n", + " export=not saved,\n", + " stateful=False,\n", + " **load_kwargs,\n", + ")\n", + "\n", + "# Save the exported model locally\n", + "if not saved:\n", + " stateless_model.save_pretrained(save_name)\n", + " tokenizer.save_pretrained(save_name)\n", + "\n", + "stateless_model.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "98d34b03-55e0-4606-be26-5722d6868679", + "metadata": {}, + "source": [ + "### Prompt lookahead decoding\n", + "Now we will run the same example from before with PLD enabled. \n", + "PLD speculates tokens by searching the last n-gram (usually 2-gram) in the sequence inside the prompt, if we find a match, we will take the next few tokens (configured with `prompt_lookup_num_tokens`) as our speculation, if a match is not found the code will revert back to auto-regressive generation.\n", + "\n", + "We will run the same example from before with PLD. 
To enable PLD, we simply pass the `prompt_lookup_num_tokens` key-word argument to the `generate` function.\n", + "Note that PLD can be great when doing code completion as some sequences of tokens tend to repeat themselves in the same order, names of variables, like `for i in range(...):`, etc.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a2c7f3-3417-4dec-981d-e99387cc18a8", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TextStreamer\n", + "\n", + "\n", + "# Tokenize the sample\n", + "inputs = tokenizer([sample], return_tensors='pt') \n", + "\n", + "out = stateless_model.generate(\n", + " **inputs,\n", + " max_new_tokens=128,\n", + " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " prompt_lookup_num_tokens=3,\n", + ") " + ] + }, + { + "cell_type": "markdown", + "id": "f0e4e211-e721-48bf-a73f-c987fd3321d3", + "metadata": {}, + "source": [ + "### Speculative decoding\n", + "Speculative Decoding was introduced in the paper [Fast Inference from Transformers via Speculative Decoding](https://arxiv.org/abs/2211.17192).\n", + "In this method the next tokens in the sequence are speculated using another smaller and much faster model which is called a draft model.\n", + "The only constraint we have on the draft model is that it has to have the same vocabulary as the target model, in our case Phi-2.\n", + "Phi-2 and CodeGen models share the same vocabulary and therefore we can use a much smaller and faster CodeGen model as a draft model to Phi-2.\n", + "A common metric for assessing if a draft model is performing well is the acceptance rate.\n", + "The acceptance rate measures how many tokens out of the speculated tokens in each window are accepted by the target model.\n", + "A higher acceptance rate will ensure a higher speedup and therefore it is a very important metric to measure when choosing a draft model.\n", + "\n", + "In this example we will use [CodeGen-350M-Multi](https://huggingface.co/Salesforce/codegen-350M-multi) as a draft model, it has 350M parameters which is ~10x smaller than Phi-2.\n", + "Next, we will prepare our chosen draft model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c996ba6-ef66-42a2-9bb4-2320372e4167", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"Salesforce/codegen-350M-multi\"\n", + "save_name = model_name.split(\"/\")[-1] + \"_openvino_stateless\"\n", + "precision = \"f32\"\n", + "quantization_config = OVWeightQuantizationConfig(\n", + " bits=4,\n", + " sym=False,\n", + " group_size=128,\n", + " ratio=0.8,\n", + ")\n", + "device = \"cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb366707-4b99-4c79-a235-d3c887136965", + "metadata": {}, + "outputs": [], + "source": [ + "# Load kwargs\n", + "load_kwargs = {\n", + " \"device\": device,\n", + " \"ov_config\": {\n", + " \"PERFORMANCE_HINT\": \"LATENCY\",\n", + " \"INFERENCE_PRECISION_HINT\": precision,\n", + " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", + " },\n", + " \"compile\": False,\n", + " \"quantization_config\": quantization_config\n", + "}\n", + "\n", + "# Check whether the model was already exported\n", + "saved = os.path.exists(save_name)\n", + "\n", + "asst_model = OVModelForCausalLM.from_pretrained(\n", + " model_name if not saved else save_name,\n", + " export=not saved,\n", + " stateful=False,\n", + " **load_kwargs,\n", + ")\n", + "\n", + "# Save the exported model locally\n", + "if not saved:\n", + " asst_model.save_pretrained(save_name)\n", + " tokenizer.save_pretrained(save_name)\n", + "\n", + "asst_model.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "4a95efed-22ce-43a0-af2a-e27500cfa514", + "metadata": {}, + "source": [ + "We will set the configuration of the draft model to predict 3 tokens at each forward step, we found that this setting works quite well in the current setup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1466938c-0945-4eb6-a80f-dd165cc5eca1", + "metadata": {}, + "outputs": [], + "source": [ + "asst_model.generation_config.num_assistant_tokens = 3\n", + "asst_model.generation_config.num_assistant_tokens_schedule = \"const\"" + ] + }, + { + "cell_type": "markdown", + "id": "74f6b4c4-4d8a-47fd-8172-6502cc5eef29", + "metadata": {}, + "source": [ + "Next, we will run the same example from before with speculative decoding." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a7e1516-6521-4346-bf85-5890341336f0", + "metadata": {}, + "outputs": [], + "source": [ + "out = stateless_model.generate(\n", + " **inputs,\n", + " max_new_tokens=128,\n", + " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " assistant_model=asst_model,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "dab6669b-f3f1-411e-b4b8-31ead823247f", + "metadata": {}, + "source": [ + "Note that in both cases of AG we presented, the generation result is exactly the same as Phi-2 would have generated it without AG!\n", + "\n", + "Like we mentioned before, the acceptance rate (AR) is a very important metric for choosing a draft.\n", + "We would like to make sure that CodeGen has a good AR with Phi-2.\n", + "For that purpose we implemented an easy utility class that uses the inputs' lengths and window sizes to calculate how many tokens were accepted by the target model at each step and calculate the AR using that information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "480d3e69-0899-4fa8-a85a-cd5a2ce23434", + "metadata": {}, + "outputs": [], + "source": [ + "from functools import wraps\n", + "import numpy as np\n", + "\n", + "\n", + "class AcceptanceRateRecorder:\n", + " def __init__(self, model):\n", + " self.model = model\n", + " self.model_forward = None\n", + " self.model_generate = None\n", + " self.seq_lens = []\n", + " self.win_sizes = []\n", + "\n", + " def __enter__(self):\n", + " # wrap forward method\n", + " if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n", + " raise RuntimeError(\"Always use a new instance, don't reuse!\")\n", + " self.model_forward = self.model.forward\n", + " \n", + " @wraps(self.model_forward)\n", + " def forward_wrapper(**kwargs):\n", + " self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n", + " self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n", + " return self.model_forward(**kwargs)\n", + " \n", + " self.model.forward = forward_wrapper\n", + " \n", + " # wrap generate method\n", + " self.model_generate = self.model.generate\n", + "\n", + " @wraps(self.model_generate)\n", + " def generate_wrapper(*args, **kwargs):\n", + " self.seq_lens.append([])\n", + " self.win_sizes.append([])\n", + " input_ids = args[0] if len(args) > 0 else kwargs.get(\"input_ids\")\n", + " self.seq_lens[-1].append(input_ids.shape[-1])\n", + " out = self.model_generate(*args, **kwargs)\n", + " self.seq_lens[-1].append(out.shape[-1])\n", + " return out\n", + " self.model.generate = generate_wrapper\n", + " return self\n", + "\n", + " def __exit__(self, type, value, traceback):\n", + " self.model.forward = self.model_forward\n", + " self.model.generate = self.model_generate\n", + " self.model_forward = None\n", + " self.model_generate = None\n", + " # Fix first window size\n", + " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", + " ws[0] -= sl[0] - 1\n", + " # Delete first seq_len, not needed anymore\n", + " self.seq_lens = [sl[1:] for sl in self.seq_lens]\n", + " # Add window size for output to ease calculation later\n", + " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", + " ws.append(0) \n", + "\n", + " def acceptance_rate(self, return_mean=True, normalize=False):\n", + " # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n", + " ar_per_win = []\n", + " for sl, ws in zip(self.seq_lens, self.win_sizes):\n", + " sl = np.array(sl, dtype=np.float64)\n", + " ws = np.array(ws, dtype=np.float64)\n", + " out_lens = sl - ws\n", + " accepted = (out_lens[1:] - out_lens[:-1] - 1)\n", + " ar_per_win.append(np.divide(accepted, ws[:-1],\n", + " out=np.zeros_like(accepted),where=ws[:-1] != 0))\n", + " ar_per_win = np.hstack(ar_per_win)\n", + " # Normalized AR doesn't take into account windows with size 0\n", + " if normalize:\n", + " ar_per_win = ar_per_win[np.nonzero(np.hstack([ws[:-1] for ws in self.win_sizes]))]\n", + " return np.mean(ar_per_win) if return_mean else ar_per_win" + ] + }, + { + "cell_type": "markdown", + "id": "c35f5e0c-5ed6-4011-a295-80a81fea8b8e", + "metadata": {}, + "source": [ + "Now we can use any dataset for text generation task and measure the AR on that dataset.\n", + "Here we use the [HumanEval](https://huggingface.co/datasets/openai_humaneval) dataset for evaluating code generation.\n", + "We run the model with speculative decoding on 30 samples.\n", + "As you will see, we are getting a very good AR of ~75% for the current configuration.\n", + "\n", + "Note that 
running this test can take a few minutes depending on the number of samples you are evaluating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "681a4974-43df-4934-8b61-75c3a92b6df1", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from datasets import load_dataset\n", + "\n", + "dataset_name = \"openai_humaneval\"\n", + "dataset_subset_name = None\n", + "field_name = \"prompt\"\n", + "prompt_template = \"\"\"{text}\"\"\"\n", + "dataset = load_dataset(dataset_name, dataset_subset_name, split=\"test\")[field_name]\n", + "samples_number = 30\n", + "with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n", + " for text in tqdm(dataset[:samples_number]):\n", + " tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n", + " stateless_model.generate(\n", + " **tokenized_prompt,\n", + " max_new_tokens=128,\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " assistant_model=asst_model,\n", + " )\n", + "print(f\"Acceptance rate: {ar_recorder.acceptance_rate() * 100:.2f}%\")" + ] + }, { "cell_type": "markdown", "id": "3f8aa25c-de59-4e79-9a1f-c03ec76d206a", "metadata": {}, "source": [ "## Chatbot demo\n", - "We will continue to build a chatbot demo running with Gradio using the model we just exported and quantized.\n", + "We will continue to build a chatbot demo running with Gradio using the models we just exported and quantized.\n", "The chatbot will be rather simple where the user will input a message and the model will reply to the user by generating text using the entire chat history as the input to the model.\n", + "We will also add an option to accelerate inference using speculative decoding with a draft model as we described in the previous section.\n", "\n", "A lot of models that were trained for the chatbot use case have been trained with special tokens to tell the model who is the current speaker and with a special system message. 
\n", "Phi-2 wasn't trained specifically for the chatbot use case and doesn't have any special tokens either, however, it has seen chats in the training data and therefore is suited for that use case.\n", @@ -328,7 +660,7 @@ " return input_token\n", "\n", "\n", - "def generate(history, temperature, max_new_tokens, top_p, repetition_penalty):\n", + "def generate(history, temperature, max_new_tokens, top_p, repetition_penalty, assisted):\n", " \"\"\"\n", " Generates the assistant's reponse given the chatbot history and generation parameters\n", "\n", @@ -339,6 +671,7 @@ " max_new_tokens: The maximum number of tokens we allow the model to generate as a response.\n", " top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.\n", " repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.\n", + " assisted: boolean parameter to enable/disable assisted generation with speculative decoding.\n", " Yields:\n", " Updated history and generation status.\n", " \"\"\"\n", @@ -354,15 +687,15 @@ " inputs = prepare_history_for_model(history)\n", " input_length = inputs['input_ids'].shape[1]\n", "\n", - " prompt_char = '▌'\n", + " prompt_char = \"▌\"\n", " history[-1][1] = prompt_char\n", - " yield (history, \"Status: Generating...\")\n", + " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", " \n", " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "\n", " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", - " stop_str = f'\\nUser:'\n", - " stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, [stop_str], tokenizer)])\n", + " stop_str = [\"\\nUser:\", \"\\nAssistant:\", \"\\nRules:\", \"\\nQuestion:\"]\n", + " stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, stop_str, tokenizer)])\n", " # Prepare input for generate\n", " generation_config = GenerationConfig(\n", " max_new_tokens=max_new_tokens,\n", @@ -379,7 +712,13 @@ " stopping_criteria=stopping_criteria,\n", " ) | inputs\n", "\n", - " t1 = Thread(target=model.generate, kwargs=generate_kwargs)\n", + " if assisted:\n", + " target_generate = stateless_model.generate\n", + " generate_kwargs[\"assistant_model\"] = asst_model\n", + " else:\n", + " target_generate = model.generate\n", + "\n", + " t1 = Thread(target=target_generate, kwargs=generate_kwargs)\n", " t1.start()\n", "\n", " # Initialize an empty string to store the generated text.\n", @@ -387,17 +726,18 @@ " for new_text in streamer:\n", " partial_text += new_text\n", " history[-1][1] = partial_text + prompt_char\n", - " # We don't yield the generated text until we are sure it is not the stop string\n", - " pos = partial_text.rfind(stop_str)\n", + " for s in stop_str:\n", + " if (pos := partial_text.rfind(s)) != -1:\n", + " break\n", " if pos != -1:\n", " partial_text = partial_text[:pos]\n", " break\n", - " elif is_partial_stop(partial_text, stop_str):\n", + " elif any([is_partial_stop(partial_text, s) for s in stop_str]):\n", " continue\n", - " yield (history, \"Status: Generating...\")\n", + " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", " history[-1][1] = partial_text\n", " generation_time = time.perf_counter() - start\n", - " yield (history, f'Generation time: {generation_time:.2f} sec')" + " yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)" ] }, { @@ 
-430,6 +770,11 @@ "source": [ "import gradio as gr\n", "\n", + "try:\n", + " demo.close()\n", + "except:\n", + " pass\n", + "\n", "\n", "EXAMPLES = [\n", " [\"What is OpenVINO?\"],\n", @@ -455,14 +800,29 @@ " return ('', history)\n", "\n", "\n", + "def prepare_for_regenerate(history):\n", + " \"\"\"\n", + " Delete last assistant message to prepare for regeneration\n", + "\n", + " Params:\n", + " history: conversation history\n", + " Returns:\n", + " updated history\n", + " \"\"\" \n", + " history[-1][1] = None\n", + " return history\n", + "\n", + "\n", "with gr.Blocks(theme=gr.themes.Soft()) as demo:\n", " gr.Markdown('
Chat with Phi-2 on Meteor Lake iGPU
')\n", " chatbot = gr.Chatbot()\n", " with gr.Row():\n", + " assisted = gr.Checkbox(value=False, label=\"Assisted Generation\", scale=10)\n", " msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n", - " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=25)\n", + " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n", " with gr.Row():\n", " submit = gr.Button(\"Submit\", variant='primary')\n", + " regenerate = gr.Button(\"Regenerate\")\n", " clear = gr.Button(\"Clear\")\n", " with gr.Accordion(\"Advanced Options:\", open=False):\n", " with gr.Row():\n", @@ -513,12 +873,24 @@ " queue=False,\n", " ).then(\n", " fn=generate,\n", - " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty],\n", - " outputs=[chatbot, status],\n", + " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", + " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", + " concurrency_limit=1,\n", + " queue=True\n", + " )\n", + " regenerate.click(\n", + " fn=prepare_for_regenerate,\n", + " inputs=chatbot,\n", + " outputs=chatbot,\n", + " queue=True,\n", + " concurrency_limit=1\n", + " ).then(\n", + " fn=generate,\n", + " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", + " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", " concurrency_limit=1,\n", " queue=True\n", " )\n", - " \n", " clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)" ] }, @@ -575,7 +947,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.13" } }, "nbformat": 4, From 4ddf9d68f970f720319a46037aaa23ac0e3dd935 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:09:08 +0800 Subject: [PATCH 03/25] Indicate the correct ipex model class in testing (#650) --- tests/ipex/test_modeling.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index bd23d4d093..af59900424 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -103,6 +103,7 @@ def __exit__(self, type, value, traceback): class IPEXModelTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModel SUPPORTED_ARCHITECTURES = ( "albert", "bert", @@ -115,8 +116,6 @@ class IPEXModelTest(unittest.TestCase): "xlm", ) - IPEX_MODEL_CLASS = IPEXModel - @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -150,11 +149,11 @@ def test_pipeline(self, model_arch): class IPEXModelForSequenceClassificationTest(IPEXModelTest): - IPEX_MODEL_CLASS = IPEXModelForTokenClassification + IPEX_MODEL_CLASS = IPEXModelForSequenceClassification class IPEXModelForTokenClassificationTest(IPEXModelTest): - IPEX_MODEL_CLASS = IPEXModelForSequenceClassification + IPEX_MODEL_CLASS = IPEXModelForTokenClassification class IPEXModelForMaskedLMTest(IPEXModelTest): @@ -162,6 +161,7 @@ class IPEXModelForMaskedLMTest(IPEXModelTest): class IPEXModelForQuestionAnsweringTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModelForQuestionAnswering SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", @@ -202,6 +202,7 @@ def test_pipeline(self, model_arch): class IPEXModelForCausalLMTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModelForCausalLM SUPPORTED_ARCHITECTURES = ( 
"bart", "gpt_bigcode", @@ -382,6 +383,7 @@ def test_pipeline(self, model_arch): class IPEXModelForImageClassificationIntegrationTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModelForImageClassification SUPPORTED_ARCHITECTURES = ( "beit", # "levit", @@ -391,7 +393,6 @@ class IPEXModelForImageClassificationIntegrationTest(unittest.TestCase): "resnet", "vit", ) - IPEX_MODEL_CLASS = IPEXModelForImageClassification @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): From c935a3ddbf0308377702ac1317ac759162da6774 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:15:38 +0800 Subject: [PATCH 04/25] Set use_cache in ipex model tests (#649) --- optimum/intel/ipex/modeling_base.py | 4 ++++ tests/ipex/test_modeling.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 0664a8e6ac..8a7a4f2028 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -89,6 +89,10 @@ def ipex_jit_trace(model, task, use_cache): model.config.return_dict = False + if "past_key_values" in sample_inputs and use_cache: + # Make sure the model will output past_key_values in generation tasks + model.config.use_cache = True + model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory ipex._C.disable_jit_linear_repack() diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index af59900424..94a5ca9e16 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -253,7 +253,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) - model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False) + model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) model.config.encoder_no_repeat_ngram_size = 0 model.to("cpu") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) From fa49187d3b790fd3cdc13b6d9262f037edb4edd3 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 8 Apr 2024 13:26:06 +0400 Subject: [PATCH 05/25] Add openvino export for InternLM2 and Orion architectures (#628) * support more models in export * add orion * update tests --- optimum/exporters/openvino/__main__.py | 15 ++++++++++++++- optimum/exporters/openvino/convert.py | 2 +- optimum/exporters/openvino/model_configs.py | 20 +++++++++++++++++++- optimum/exporters/openvino/model_patcher.py | 2 +- tests/openvino/test_modeling.py | 4 +++- tests/openvino/utils_tests.py | 3 +++ 6 files changed, 41 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 5d6e31ebac..dbea798f75 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -202,7 +202,6 @@ def main_export( quantization_config = getattr(config, "quantization_config", None) do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" model_type = config.model_type.replace("_", "-") - if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( @@ -220,6 +219,20 @@ def main_export( ) if is_transformers_version(">=", "4.36") and model_type in 
SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: loading_kwargs["attn_implementation"] = "eager" + # there are some difference between remote and in library representation of past key values for some models, + # for avoiding confusion we disable remote code for them + if ( + trust_remote_code + and model_type in {"falcon", "mpt", "phi"} + and ("with-past" in task or original_task == "auto") + and not custom_export_configs + ): + logger.warning( + f"Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" + "using default export configuration, `trust_remote_code` will be disabled. " + "Please provide custom export config if you want load model with remote code." + ) + trust_remote_code = False # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 98dd22d824..ccc046ce55 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -345,7 +345,7 @@ def ts_patched_forward(*args, **kwargs): input_dict = dict(zip(keys, tuple_input)) kwargs[input_name] = input_dict outputs = patched_forward(*args, **kwargs) - return tuple(outputs.values()) + return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs.values()]) patcher.patched_forward = ts_patched_forward diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a274b3671d..6f22cf2142 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -74,7 +74,7 @@ def init_model_configs(): @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") -class BaichaunOpenVINOConfig(TextDecoderOnnxConfig): +class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" @@ -400,3 +400,21 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") +class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") +class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 2cedf64b0a..bafd467dd4 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -513,5 +513,5 @@ def __init__( ): super().__init__(config, 
model, model_kwargs) # model has first inference buffers initialization - if self._model.lm_head.first_flag: + if hasattr(self._model.lm_head, "first_flag"): self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f54305113f..32fc255a1f 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -524,10 +524,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "stablelm", "starcoder2", "phi", + "internlm2", + "orion", ) GENERATION_LENGTH = 100 IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") - REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "olmo", "orion") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c95444274e..e7f62f1f61 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -50,6 +50,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", + "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", "llama": "fxmarty/tiny-llama-fast-tokenizer", @@ -69,6 +70,8 @@ "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", "mt5": "stas/mt5-tiny-random", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "olmo": "katuni4ka/tiny-random-olmo", + "orion": "katuni4ka/tiny-random-orion", "pegasus": "hf-internal-testing/tiny-random-pegasus", "pix2struct": "fxmarty/pix2struct-tiny-random", "phi": "echarlaix/tiny-random-PhiForCausalLM", From c290c55670c309a9671fe7a1a86f6bc56deec54f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 8 Apr 2024 12:40:53 +0200 Subject: [PATCH 06/25] Added passing of CI var to docker. (#648) * Added passing of CI var to docker. * Minor correction. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 83035cf467..69e103466d 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ build_doc_docker_image: doc: build_doc_docker_image @test -n "$(BUILD_DIR)" || (echo "BUILD_DIR is empty." ; exit 1) @test -n "$(VERSION)" || (echo "VERSION is empty." 
; exit 1) - docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder doc_maker \ + docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder --env CI=$(CI) doc_maker \ doc-builder build optimum.intel /optimum-intel/docs/source/ \ --repo_name optimum-intel \ --build_dir $(BUILD_DIR) \ From 972491991710f8a92cdef35e0914de92a88995a4 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 8 Apr 2024 15:45:54 +0200 Subject: [PATCH 07/25] Replace deprecated transformers is_torch_tpu_available (#652) --- optimum/intel/neural_compressor/trainer.py | 10 +++++----- optimum/intel/neural_compressor/trainer_seq2seq.py | 2 +- optimum/intel/openvino/trainer.py | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index fc20cdafeb..37e229675a 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -62,7 +62,7 @@ is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, - is_torch_tpu_available, + is_torch_xla_available, logging, ) @@ -95,7 +95,7 @@ if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp -if is_torch_tpu_available(check_device=False): +if is_torch_xla_available(): import torch_xla.core.xla_model as xm @@ -517,7 +517,7 @@ def _inner_training_loop( if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + and not is_torch_xla_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses @@ -611,7 +611,7 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: # Wait for everyone to get here so we are sure the model has been saved by process 0. 
- if is_torch_tpu_available(): + if is_torch_xla_available(): xm.rendezvous("load_best_model_at_end") elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() @@ -945,7 +945,7 @@ def get_model_sparsity(self): def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): # TODO : can be removed once transformers >= v4.38.0 if self.control.should_log and self.state.global_step > self._globalstep_last_logged: - if is_torch_tpu_available(): + if is_torch_xla_available(): xm.mark_step() logs: Dict[str, float] = {} diff --git a/optimum/intel/neural_compressor/trainer_seq2seq.py b/optimum/intel/neural_compressor/trainer_seq2seq.py index 123ced6030..27540cfb14 100644 --- a/optimum/intel/neural_compressor/trainer_seq2seq.py +++ b/optimum/intel/neural_compressor/trainer_seq2seq.py @@ -17,7 +17,7 @@ import torch from torch import nn from torch.utils.data import Dataset -from transformers.deepspeed import is_deepspeed_zero3_enabled +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from transformers.trainer_utils import PredictionOutput from transformers.utils import logging diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index b7d110c96a..6165837f58 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -80,7 +80,7 @@ is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, - is_torch_tpu_available, + is_torch_xla_available, logging, ) @@ -123,7 +123,7 @@ if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp -if is_torch_tpu_available(check_device=False): +if is_torch_xla_available(): import torch_xla.core.xla_model as xm core = Core() @@ -611,7 +611,7 @@ def _inner_training_loop( if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + and not is_torch_xla_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses @@ -706,7 +706,7 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: # Wait for everyone to get here so we are sure the model has been saved by process 0. 
- if is_torch_tpu_available(): + if is_torch_xla_available(): xm.rendezvous("load_best_model_at_end") elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() @@ -799,7 +799,7 @@ def compute_loss(self, model, inputs, return_outputs=False): def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): if self.control.should_log: - if is_torch_tpu_available(): + if is_torch_xla_available(): xm.mark_step() logs: Dict[str, float] = {} From 91e635e023a16390145f95c0ba1da5f851b6185e Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 9 Apr 2024 17:45:07 +0400 Subject: [PATCH 08/25] Fix causal mask update bf16 accuracy issue in gemma (#654) * fix causal mask update bf16 accuracy issue in gemma * update llama config --- optimum/exporters/openvino/model_configs.py | 21 ++++- optimum/exporters/openvino/model_patcher.py | 88 ++++++++++++++++++++- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 6f22cf2142..90297c8fb3 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -19,7 +19,7 @@ from transformers.utils import is_tf_available from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig -from optimum.exporters.onnx.model_configs import GemmaOnnxConfig +from optimum.exporters.onnx.model_configs import GemmaOnnxConfig, LlamaOnnxConfig from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( @@ -34,6 +34,7 @@ BaichuanModelPatcher, ChatGLMModelPatcher, GemmaModelPatcher, + LlamaModelPatcher, MixtralModelPatcher, QwenModelPatcher, ) @@ -274,6 +275,24 @@ def patch_model_for_export( return GemmaModelPatcher(self, model, model_kwargs=model_kwargs) +@register_in_tasks_manager( + "llama", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class LlamaOpenVINOConfig(LlamaOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return LlamaModelPatcher(self, model, model_kwargs=model_kwargs) + + class QwenDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index bafd467dd4..4213d591d3 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -288,10 +288,74 @@ def __exit__(self, exc_type, exc_value, traceback): block.self_attention.core_attention.forward = block.self_attention.core_attention._orig_forward +# adopted from +# https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/gemma/modeling_gemma.py#L965 +# https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/llama/modeling_llama.py#L1058 +def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_position, **kwargs): + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # for compatibility with https://github.com/huggingface/transformers/pull/30047 + current_length = kwargs.get("current_length", cache_position[-1]) + dtype, device = input_tensor.dtype, input_tensor.device + + # using minimum from dtype with 
larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[-2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + class GemmaModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() + # gemma has some accuracy issues with bf16 with transformers >= 4.39 + # fill causal mask in slightly different way for avoid overflow on some platforms + if is_transformers_version(">=", "4.39.0"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + _llama_gemma_update_causal_mask, self._model.model + ) + # init inv_freq for torchscript tracing # https://github.com/huggingface/transformers/blob/ed74d97871468f3a4695ede50abdc0b55717a84d/src/transformers/models/gemma/modeling_gemma.py#L108 for layer in self._model.model.layers: @@ -301,6 +365,29 @@ def __enter__(self): rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) ) + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if hasattr(self._model.model, "_orig_update_causal_mask"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + + +class LlamaModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + + # llama has some accuracy issues with bf16 with transformers >= 4.39 + # fill causal mask in slightly different way for avoid overflow on some platforms + if is_transformers_version(">=", "4.39.0"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + _llama_gemma_update_causal_mask, self._model.model + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if hasattr(self._model.model, "_orig_update_causal_mask"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + SUPPORT_SDPA = is_torch_version(">", "2.1.0") @@ -465,7 +552,6 @@ def _qwen_attention_forward( raise ValueError("Cannot output attentions while using flash-attn") else: outputs += (attn_weight,) - return outputs From e79da779ea7be779e4c6b2f6cb027607a3fe8248 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:21:15 +0200 Subject: [PATCH 09/25] Fix transformers version v4.38 compatibility (#657) --- optimum/intel/neural_compressor/trainer.py | 7 ++++++- optimum/intel/openvino/trainer.py | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index 37e229675a..b6089746e8 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -62,7 +62,6 @@ is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, - is_torch_xla_available, logging, ) @@ -73,6 +72,12 @@ from .configuration import INCConfig +if is_transformers_version(">=", "4.39.0"): + from transformers.utils import is_torch_xla_available +else: + from transformers.utils import is_torch_tpu_available as is_torch_xla_available + + if is_accelerate_available(): from accelerate import __version__ as accelerate_version from accelerate import skip_first_batches diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 6165837f58..25b7f35d07 100644 --- a/optimum/intel/openvino/trainer.py +++ 
b/optimum/intel/openvino/trainer.py @@ -80,7 +80,6 @@ is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, - is_torch_xla_available, logging, ) @@ -101,6 +100,11 @@ ) +if is_transformers_version(">=", "4.39.0"): + from transformers.utils import is_torch_xla_available +else: + from transformers.utils import is_torch_tpu_available as is_torch_xla_available + if is_accelerate_available(): from accelerate import __version__ as accelerate_version from accelerate import skip_first_batches From e7108defeb514dd6a602ccee1911948ee7d9e5b2 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:41:01 +0200 Subject: [PATCH 10/25] Schedule nightly tests (#653) * Schedule nightly slow tests * set test to slow * merge tests * fix format * fix test for chatglm --- .github/workflows/test_openvino.yml | 2 +- .github/workflows/test_openvino_basic.yml | 6 +- setup.py | 1 + tests/openvino/test_modeling.py | 108 ++++++++++++---------- tests/openvino/utils_tests.py | 1 + 5 files changed, 66 insertions(+), 52 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index ba5b09ff81..bff5cb525f 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -35,7 +35,7 @@ jobs: pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime - name: Test with Pytest run: | - pytest tests/openvino/ --ignore test_modeling_basic + pytest tests/openvino/ --ignore test_modeling_basic --durations=0 - name: Test openvino-nightly run: | pip uninstall -y openvino diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index effb99a84d..3135e6c004 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -25,7 +25,7 @@ jobs: # Testing lower and upper bound of supported Python versions # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.11"] - transformers: ['transformers', 'git+https://github.com/huggingface/transformers.git'] + transformers: ['transformers'] optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git'] runs-on: ubuntu-20.04 @@ -42,7 +42,7 @@ jobs: # Install openvino manually to prevent dependency conflicts when .[openvino] pins # optimum or transformers to a specific version # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages - pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install .[tests] openvino onnx onnxruntime ${{ matrix.optimum}} ${{ matrix.transformers }} - name: Pip freeze @@ -51,4 +51,4 @@ jobs: - name: Test with Pytest run: | pytest tests/openvino/test_modeling_basic.py - + RUN_SLOW=1 pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0 \ No newline at end of file diff --git a/setup.py b/setup.py index e80d0ea448..a8c43f51d4 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ "auto-gptq", "transformers_stream_generator", "einops", + "tiktoken", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 32fc255a1f..907c767310 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -20,6 +20,7 @@ from typing import Dict import numpy as np +import pytest import requests import timm import torch @@ -53,6 +54,7 @@ 
set_seed, ) from transformers.onnx.utils import get_preprocessor +from transformers.testing_utils import slow from utils_tests import MODEL_NAMES from optimum.intel import ( @@ -364,6 +366,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForQuestionAnswering.from_pretrained(model_id, export=True) @@ -379,6 +383,8 @@ def test_pipeline(self, model_arch): del model gc.collect() + @pytest.mark.run_slow + @slow def test_metric(self): model_id = "distilbert-base-cased-distilled-squad" set_seed(SEED) @@ -431,6 +437,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForTokenClassification.from_pretrained(model_id, export=True) @@ -481,6 +489,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForFeatureExtraction.from_pretrained(model_id, export=True) @@ -526,9 +536,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "phi", "internlm2", "orion", + "falcon", ) GENERATION_LENGTH = 100 - IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "olmo", "orion") @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -553,37 +563,63 @@ def test_compare_to_transformers(self, model_arch): ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs) self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) - self.assertEqual( - ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful - ) - set_seed(SEED) - transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) - if model_arch == "qwen": - transformers_model.to(torch.float32) - tokens = tokenizer( - "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None - ) - ov_outputs = ov_model(**tokens) + tokens = tokenizer("This is a sample output", return_tensors="pt") + ov_outputs = ov_model(**tokens) self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) self.assertTrue("past_key_values" in ov_outputs) self.assertIsInstance(ov_outputs.past_key_values, tuple) - is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL + is_stateful = ov_model.config.model_type not in not_stateful self.assertEqual(ov_model.stateful, is_stateful) if is_stateful: self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + + set_seed(SEED) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + if model_arch == "qwen": + transformers_model.to(torch.float32) + with torch.no_grad(): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs self.assertTrue(torch.allclose(ov_outputs.logits, 
transformers_outputs.logits, equal_nan=True, atol=1e-4)) + + # Qwen tokenizer does not support padding + if model_arch == "qwen": + return + + if model_arch != "chatglm": + tokenizer.pad_token_id = tokenizer.eos_token_id + # Compare batched generation + tokenizer.padding_side = "left" + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + ov_model.generation_config.eos_token_id = None + transformers_model.generation_config.eos_token_id = None + ov_model.config.eos_token_id = None + transformers_model.config.eos_token_id = None + gen_config = GenerationConfig( + max_new_tokens=30, + min_new_tokens=30, + num_beams=3, + do_sample=False, + eos_token_id=None, + ) + + ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_outputs, transformers_outputs)) + del transformers_model del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_kwargs = {} model_id = MODEL_NAMES[model_arch] @@ -613,35 +649,6 @@ def test_pipeline(self, model_arch): del model gc.collect() - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_multiple_inputs(self, model_arch): - model_id = MODEL_NAMES[model_arch] - set_seed(SEED) - if model_arch == "qwen": - self.skipTest("Qwen tokenizer does not support padding") - model_kwargs = {} - if model_arch in self.REMOTE_CODE_MODELS: - model_kwargs = { - "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), - "trust_remote_code": True, - } - model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, **model_kwargs) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) - tokenizer.pad_token = tokenizer.eos_token - texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] - tokens = tokenizer(texts, padding=True, return_tensors="pt") - generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2) - outputs = model.generate(**tokens, generation_config=generation_config) - self.assertIsInstance(outputs, torch.Tensor) - self.assertEqual(outputs.shape[0], 3) - # test that generation result is reproducible - outputs2 = model.generate(**tokens, generation_config=generation_config) - self.assertIsInstance(outputs2, torch.Tensor) - self.assertEqual(outputs2.shape[0], 3) - self.assertTrue(torch.allclose(outputs2, outputs)) - del model - gc.collect() - def test_model_and_decoder_same_device(self): model_id = MODEL_NAMES["gpt2"] model = OVModelForCausalLM.from_pretrained(model_id, export=True) @@ -667,12 +674,11 @@ def test_compare_with_and_without_past_key_values(self): self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - if self.IS_SUPPORT_STATEFUL: - model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True) - outputs_model_stateful = model_stateful.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful)) + model_stateful = OVModelForCausalLM.from_pretrained(model_id, 
export=True, use_cache=True, stateful=True) + outputs_model_stateful = model_stateful.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) + self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful)) del model_with_pkv del model_without_pkv @@ -851,6 +857,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, export=True) @@ -981,6 +989,8 @@ def test_pipeline(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True) @@ -1438,6 +1448,8 @@ def test_load_vanilla_transformers_which_is_not_supported(self): self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_generate_utils(self, model_arch: str): model_id = MODEL_NAMES[model_arch] model = OVModelForVision2Seq.from_pretrained(model_id, export=True) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index e7f62f1f61..73224c81b2 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -42,6 +42,7 @@ "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "electra": "hf-internal-testing/tiny-random-electra", "gemma": "fxmarty/tiny-random-GemmaForCausalLM", + "falcon": "fxmarty/really-tiny-falcon-testing", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", From 2b3f550bb822cd405850df4fa25eb0414975dddc Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 10 Apr 2024 17:27:33 +0800 Subject: [PATCH 11/25] Fix performance issue for Qwen dynamic causal mask (#651) * [Qwen]Fix performance issue with dynamic causal mask * [Qwen] fix code style --- optimum/exporters/openvino/model_patcher.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4213d591d3..3649c163c6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -531,9 +531,10 @@ def _qwen_attention_forward( value = value.permute(0, 2, 1, 3) if not self.use_cache_quantization and SUPPORT_SDPA: - causal_mask = registered_causal_mask[:, :, key.size(-2) - query.size(-2) : key.size(-2), : key.size(-2)] + # For performance, using constant tril to generate causal_mask + causal_mask = self.bias[:, :, key.size(-2) - query.size(-2) : key.size(-2), : key.size(-2)] if attention_mask is not None: - attention_mask = attention_mask.expand(-1, -1, causal_mask.size(2), -1).masked_fill( + attention_mask = attention_mask.expand(-1, -1, query.size(2), -1).masked_fill( ~causal_mask, torch.finfo(query.dtype).min ) else: @@ -578,8 +579,17 @@ def __init__( def __enter__(self): super().__enter__() + max_positions = self._model.config.seq_length for block in self._model.transformer.h: block.attn._orig_forward = block.attn.forward + # For performance, using constant tril to generate causal_mask + block.attn.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), 
dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) block.attn.forward = types.MethodType(_qwen_attention_forward, block.attn) def __exit__(self, exc_type, exc_value, traceback): From ff792c278502a85444dd116413dbca71aa660599 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Wed, 10 Apr 2024 11:34:28 +0200 Subject: [PATCH 12/25] Update pip install command in docs (#656) --- README.md | 6 +++--- docs/source/installation.mdx | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c29a923745..78ca130145 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| -| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | -| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade-strategy eager "optimum[openvino]"` | -| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | +| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"` | +| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` | +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index c29f5ceb95..aaab1b1f83 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -18,10 +18,10 @@ limitations under the License. To install the latest release of 🤗 Optimum Intel with the corresponding required dependencies, you can do respectively: -| Accelerator | Installation | -|:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------| -| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"`| -| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade-strategy eager "optimum[openvino]"` | +| Accelerator | Installation | +|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------| +| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"`| +| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. 
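The Qwen attention patch above (PATCH 11/25) replaces a dynamically computed causal mask with a constant `tril` buffer that is registered once on the attention module and merely sliced on each forward call. The sketch below illustrates that idea on a toy attention module; it is not code from the patch series, and the class name, tensor shapes, and the assumption that `attention_mask` arrives as a broadcastable `[batch, 1, 1, kv_len]` float mask are invented for the example.

```python
# Minimal sketch (assumptions noted above): a precomputed lower-triangular boolean
# buffer replaces a causal mask that would otherwise be rebuilt every forward pass.
import torch
import torch.nn.functional as F


class ToyCausalSelfAttention(torch.nn.Module):
    def __init__(self, max_positions: int = 128):
        super().__init__()
        # Constant tril buffer, registered once; persistent=False keeps it out of the
        # state_dict so checkpoint loading is unaffected.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )

    def forward(self, query, key, value, attention_mask=None):
        # query/key/value: [batch, heads, seq_len, head_dim]
        # Slice the constant buffer to the current query/key lengths instead of
        # recomputing a causal mask tensor for every call.
        causal_mask = self.bias[:, :, key.size(-2) - query.size(-2) : key.size(-2), : key.size(-2)]
        if attention_mask is not None:
            # Merge an additive [batch, 1, 1, kv_len] padding mask with the causal mask,
            # mirroring the shape handling in the patched Qwen forward.
            attention_mask = attention_mask.expand(-1, -1, query.size(2), -1).masked_fill(
                ~causal_mask, torch.finfo(query.dtype).min
            )
        else:
            attention_mask = causal_mask
        return F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask)


if __name__ == "__main__":
    attn = ToyCausalSelfAttention(max_positions=16)
    q = k = v = torch.randn(1, 2, 8, 4)  # [batch, heads, seq_len, head_dim]
    print(attn(q, k, v).shape)  # torch.Size([1, 2, 8, 4])
```

Because the mask lives in a buffer rather than being derived from the inputs, tracing for export sees it as a constant, which is the performance point the patch is making for the dynamic-shape case.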
From 402b9dba128429a045e4cb70d545d4e76cc4f5d4 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Fri, 12 Apr 2024 09:37:12 +0200 Subject: [PATCH 13/25] Disable message about ONNX configs on export (#659) --- optimum/exporters/openvino/convert.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index ccc046ce55..01eb42614e 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -564,6 +564,7 @@ def export_from_model( kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] ) + logging.disable(logging.INFO) export_config, models_and_export_configs = _get_submodels_and_export_configs( model=model, task=task, @@ -578,6 +579,7 @@ def export_from_model( legacy=False, exporter="openvino", ) + logging.disable(logging.NOTSET) if ov_config is None: if library_name == "diffusers": From 0540b1212dfc7d696dc015f3503eb51302bc3c94 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 12 Apr 2024 16:53:30 +0400 Subject: [PATCH 14/25] Fix sentence transformer model export with openvino (#660) * fix sentence transformer model export with openvino * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/commands/export/openvino.py | 9 +++++++++ optimum/exporters/openvino/__main__.py | 8 ++++++++ optimum/exporters/openvino/convert.py | 2 ++ 3 files changed, 19 insertions(+) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 997ec44aa5..6c17a333ef 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -121,6 +121,14 @@ def parse_args_openvino(parser: "ArgumentParser"): help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers", ) + optional_group.add_argument( + "--library", + type=str, + choices=["transformers", "diffusers", "timm", "sentence_transformers"], + default=None, + help=("The library on the model. If not provided, will attempt to infer the local checkpoint's library"), + ) + class OVExportCommand(BaseOptimumCLICommand): COMMAND = CommandInfo(name="openvino", help="Export PyTorch models to OpenVINO IR.") @@ -201,5 +209,6 @@ def run(self): ov_config=ov_config, stateful=not self.args.disable_stateful, convert_tokenizer=self.args.convert_tokenizer, + library_name=self.args.library # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index dbea798f75..8b8cc09fc1 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -163,10 +163,18 @@ def main_export( original_task = task task = TasksManager.map_from_synonym(task) framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) + library_name_is_not_provided = library_name is None library_name = TasksManager.infer_library_from_model( model_name_or_path, subfolder=subfolder, library_name=library_name ) + if library_name == "sentence_transformers" and library_name_is_not_provided: + logger.warning( + "Library name is not specified. There are multiple possible variants: `sentence_tenasformers`, `transformers`." + "`transformers` will be selected. 
If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers" + ) + library_name = "transformers" + if task == "auto": try: task = TasksManager.infer_task_from_model(model_name_or_path) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 01eb42614e..5dd7c7bd90 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -382,6 +382,8 @@ def ts_patched_forward(*args, **kwargs): sig = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.call) ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} + if not ordered_dummy_inputs: + ordered_dummy_inputs = dummy_inputs ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) ov_model.validate_nodes_and_infer_types() From ff5d185738d2f5f93dc609870474b205d8136b99 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 15 Apr 2024 11:03:11 +0200 Subject: [PATCH 15/25] Introduce OVQuantizationConfig for nncf.quantize() parameters (#638) * Introduce OVQuantizationConfig for nncf.quantize() parameters * Ignored scope tweaks * Added **kwargs to quantization call. Added config serialization test. * Ignored scope changes. Tests pass. * Added documentation * Linters * Linters * Tweak ignored scope serialization * Added deprecation errors, tweak docs * Addressed minor comments * Make quantization config contain only serializable properties. * Small tweaks * Address comments * Fix ruff * Fix ruff 2 --- optimum/intel/__init__.py | 2 + optimum/intel/openvino/__init__.py | 2 +- optimum/intel/openvino/configuration.py | 321 ++++++++++++++------- optimum/intel/openvino/modeling_base.py | 12 +- optimum/intel/openvino/modeling_decoder.py | 15 +- optimum/intel/openvino/quantization.py | 283 ++++++++++-------- optimum/intel/openvino/trainer.py | 31 +- tests/openvino/test_quantization.py | 228 ++++++++++++++- tests/openvino/test_training.py | 4 +- 9 files changed, 654 insertions(+), 244 deletions(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 29abd00034..c097562651 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -124,6 +124,7 @@ "OVModelForVision2Seq", "OVModelForSequenceClassification", "OVModelForTokenClassification", + "OVQuantizationConfig", "OVWeightQuantizationConfig", "OVConfig", ] @@ -243,6 +244,7 @@ OVModelForSpeechSeq2Seq, OVModelForTokenClassification, OVModelForVision2Seq, + OVQuantizationConfig, OVWeightQuantizationConfig, ) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 27a966865f..0cd7d8a029 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -43,7 +43,7 @@ from .trainer import OVTrainer -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 40a60bb58e..e75301729d 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -11,71 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import copy +import inspect +import logging from dataclasses import dataclass +from enum import Enum from typing import Any, Dict, List, Optional, Union +import nncf import torch +from nncf.quantization.advanced_parameters import OverflowFix from transformers import PretrainedConfig -from transformers.utils.quantization_config import QuantizationConfigMixin +from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod from optimum.configuration_utils import BaseConfig -DEFAULT_QUANTIZATION_CONFIG = { - "algorithm": "quantization", - "preset": "mixed", - "overflow_fix": "disable", - "initializer": { - "range": {"num_init_samples": 300, "type": "mean_min_max"}, - "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, - }, - "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, - "ignored_scopes": [ - "{re}.*Embedding.*", - "{re}.*add___.*", - "{re}.*layer_norm_.*", - "{re}.*matmul_1", - "{re}.*__truediv__.*", - ], -} - -INT8_WEIGHT_COMPRESSION_CONFIG = { - "algorithm": "quantization", - "weights": { - "mode": "symmetric", - "bits": 8, - "target_scopes": [ - "{re}.*Embedding.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - ], - "ignored_scopes": [ - "{re}.*conv_*", - ], - }, - "activations": { - "ignored_scopes": [ - "{re}.*add___.*", - "{re}.*__radd___.*", - "{re}.*layer_norm_.*", - "{re}.*__truediv__.*", - "{re}.*__mul___.*", - "{re}.*__rmul___.*", - "{re}.*tanh_.*", - "{re}.*pow_.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - "{re}.*conv_.*", - ], - }, - "overflow_fix": "disable", -} - +logger = logging.getLogger(__name__) _DEFAULT_4BIT_CONFIGS = { "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, @@ -100,31 +52,75 @@ } +@dataclass +class OVQuantizationConfigBase(QuantizationConfigMixin): + """ + Base configuration class for quantization parameters + """ + + def __init__( + self, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = None, + weight_only: Optional[bool] = None, + **kwargs, + ): + """ + Args: + ignored_scope (`dict`, *optional*): + An ignored scope that defines a list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. 
+ """ + if isinstance(ignored_scope, nncf.IgnoredScope): + ignored_scope = ignored_scope.__dict__ + self.ignored_scope = ignored_scope + self.num_samples = num_samples + self.weight_only = weight_only + + def post_init(self): + try: + self.get_ignored_scope_instance() + except Exception as e: + raise ValueError( + f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" + ) + if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): + raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") + + def get_ignored_scope_instance(self) -> nncf.IgnoredScope: + if self.ignored_scope is None: + return nncf.IgnoredScope() + return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) + + class OVConfig(BaseConfig): CONFIG_NAME = "openvino_config.json" FULL_CONFIGURATION_FILE = "openvino_config.json" def __init__( self, - compression: Union[List[Dict], Dict, None] = None, input_info: Optional[List] = None, save_onnx_model: bool = False, - quantization_config: Optional[Union[QuantizationConfigMixin, Dict]] = None, + quantization_config: Optional[Union[dict, OVQuantizationConfigBase]] = None, dtype: Optional[str] = None, **kwargs, ): super().__init__() - self.compression = compression self.input_info = input_info self.save_onnx_model = save_onnx_model - self._enable_standard_onnx_export_option() self.optimum_version = kwargs.pop("optimum_version", None) - self.quantization_config = quantization_config or {} + if isinstance(quantization_config, dict): + quantization_config = self._quantization_config_from_dict(quantization_config) + self.quantization_config = quantization_config + self.compression = None # A field for backward-compatability of training-time compression parameters - if isinstance(quantization_config, QuantizationConfigMixin): - bits = self.quantization_config.bits - else: - bits = self.quantization_config.get("bits", None) + bits = ( + self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None + ) self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): @@ -137,41 +133,68 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): for name, value in model_inputs.items() ] - def save_pretrained(self, *args, **kwargs): - super().save_pretrained(*args, **kwargs) - - def _enable_standard_onnx_export_option(self): - # This method depends on self.save_onnx_model. - # save_onnx_model is defaulted to false so that the final model output is - # in OpenVINO IR to realize performance benefit in OpenVINO runtime. - # True value of save_onnx_model will save a model in onnx format. 
- if ( - isinstance(self.compression, dict) - and "algorithm" in self.compression - and self.compression["algorithm"] == "quantization" - ): - self.compression["export_to_onnx_standard_ops"] = self.save_onnx_model - elif isinstance(self.compression, list): - for i, algo_config in enumerate(self.compression): - if algo_config["algorithm"] == "quantization": - self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model + @staticmethod + def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: + wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args + q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args + config_keys = quantization_config.keys() + matches_wq_config_signature = all(arg_name in wq_args for arg_name in config_keys) + matches_q_config_signature = all(arg_name in q_args for arg_name in config_keys) + if matches_wq_config_signature == matches_q_config_signature: + weight_only = quantization_config.get("weight_only", None) + if weight_only is None: + logger.warning( + "Can't determine type of OV quantization config. Please specify explicitly whether you intend to " + "run weight-only quantization or not with `weight_only` parameter. Creating an instance of " + "OVWeightQuantizationConfig." + ) + return OVWeightQuantizationConfig.from_dict(quantization_config) + matches_wq_config_signature = weight_only + + config_type = OVWeightQuantizationConfig if matches_wq_config_signature else OVQuantizationConfig + return config_type.from_dict(quantization_config) + + def _to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: + class ConfigStub: + def to_dict(self): + return None + + def to_diff_dict(self): + return None + + if self.quantization_config is None: + # Parent to_dict() implementation does not support quantization_config being None + self_copy = copy.deepcopy(self) + self_copy.quantization_config = ConfigStub() + result = self_copy.to_diff_dict() if to_diff_dict else self_copy.to_dict() + else: + result = super().to_diff_dict() if to_diff_dict else super().to_dict() + return result + + def to_dict(self) -> Dict[str, Any]: + return self._to_dict_safe(to_diff_dict=False) + + def to_diff_dict(self) -> Dict[str, Any]: + return self._to_dict_safe(to_diff_dict=True) + + +class OVQuantizationMethod(str, Enum): + DEFAULT = "default" @dataclass -class OVWeightQuantizationConfig(QuantizationConfigMixin): +class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been - loaded using `optimum-intel` api for quantization with NNCF. - + loaded using `optimum-intel` api for weight-only quantization with NNCF. For full model quantization please see + OVQuantizationConfig. Args: - bits (`int`, defaults to 8): The number of bits to quantize to. sym (`bool`, defaults to `False`): - Whether to use symetric quantization. - tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): + Whether to use symmetric quantization. + tokenizer (`str`, *optional*): The tokenizer used to process the dataset. You can pass either: - - A custom tokenizer object. - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. 
@@ -179,30 +202,37 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin): using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. dataset (`str or List[str]`, *optional*): The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset - in a list of strings or just use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs + in a list of strings or just use the one from the list ['wikitext','c4','c4-new','ptb','ptb-new'] for LLLMs or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. + Alternatively, you can provide data objects via `calibration_dataset` argument + of `OVQuantizer.quantize()` method. ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). group_size (`int`, *optional*): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. all_layers (`bool`, *optional*): - Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion. + Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. sensitivity_metric (`str`, *optional*): The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model, the more sensitive layers receives a higher precision. ignored_scope (`dict`, *optional*): - An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. - + quant_method (`str`, defaults of OVQuantizationMethod.DEFAULT): + Weight compression method to apply. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building + the config from dictionary. """ def __init__( self, bits: int = 8, sym: bool = False, - tokenizer: Optional[Any] = None, + tokenizer: Optional[str] = None, dataset: Optional[Union[str, List[str]]] = None, ratio: float = 1.0, group_size: Optional[int] = None, @@ -210,8 +240,16 @@ def __init__( sensitivity_metric: Optional[str] = None, ignored_scope: Optional[dict] = None, num_samples: Optional[int] = None, + quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT, + weight_only: Optional[bool] = True, **kwargs, ): + if weight_only is False: + logger.warning( + "Trying to create an instance of `OVWeightQuantizationConfig` with `weight_only` being " + "False. Please check your configuration." 
+ ) + super().__init__(ignored_scope, num_samples, True) self.bits = bits self.sym = sym self.tokenizer = tokenizer @@ -220,21 +258,25 @@ def __init__( self.ratio = ratio self.all_layers = all_layers self.sensitivity_metric = sensitivity_metric - self.ignored_scope = ignored_scope - self.num_samples = num_samples - self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release + self.quant_method = quant_method self.post_init() def post_init(self): r""" Safety checker that arguments are correct """ + super().post_init() if self.ratio is not None and not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") + if not (self.dataset is None or isinstance(self.dataset, (str, list))): + raise ValueError( + f"Dataset must be a instance of either string or list of strings, but found {type(self.dataset)}. " + f"If you wish to provide a custom dataset please pass it via `calibration_dataset` argument." + ) if self.dataset is not None and isinstance(self.dataset, str): - llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"] + llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"] stable_diffusion_datasets = [ "conceptual_captions", "laion/220k-GPT4Vision-captions-from-LIVIS", @@ -259,6 +301,87 @@ def post_init(self): f"For 8-bit quantization, `group_size` is expected to be set to -1, but was set to {self.group_size}" ) + if self.tokenizer is not None and not isinstance(self.tokenizer, str): + raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}") + + +@dataclass +class OVQuantizationConfig(OVQuantizationConfigBase): + def __init__( + self, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = 300, + preset: nncf.QuantizationPreset = None, + model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, + fast_bias_correction: bool = True, + overflow_fix: OverflowFix = OverflowFix.DISABLE, + weight_only: Optional[bool] = False, + **kwargs, + ): + """ + Configuration class containing parameters related to model quantization with NNCF. Compared to weight + compression, during quantization both weights and activations are converted to lower precision. + For weight-only model quantization please see OVWeightQuantizationConfig. + Args: + ignored_scope (`dict`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + preset (`nncf.QuantizationPreset`, *optional*): + A preset controls the quantization mode (symmetric and asymmetric). + It can take the following values: + - `performance`: Symmetric quantization of weights and activations. + - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations. + Default value is None. In this case, `mixed` preset is used for `transformer` + model type otherwise `performance`. + model_type (`nncf.ModelType`, defaults to nncf.ModelType.TRANSFORMER): + Model type is needed to specify additional patterns in the model. Supported only `transformer` now. + fast_bias_correction (`bool`, defaults to True): + Whether to apply fast or full bias correction algorithm. 
+ overflow_fix (`nncf.OverflowFix`, default to OverflowFix.DISABLE): + Parameter for controlling overflow fix setting. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building + the config from dictionary. + """ + if weight_only is True: + logger.warning( + "Trying to create an instance of `OVQuantizationConfig` with `weight_only` being True. " + "Please check your configuration." + ) + super().__init__(ignored_scope, num_samples, False) + # TODO: remove checks below once NNCF is updated to 2.10 + if isinstance(overflow_fix, str): + overflow_fix = OverflowFix(overflow_fix) + if isinstance(preset, str): + preset = nncf.QuantizationPreset(preset) + + self.preset = preset + self.model_type = model_type + self.fast_bias_correction = fast_bias_correction + self.overflow_fix = overflow_fix + self.post_init() + + def to_dict(self) -> Dict[str, Any]: + # TODO: remove code below once NNCF is updated to 2.10 + if isinstance(self.overflow_fix, Enum) or isinstance(self.preset, Enum): + overflow_fix_value = ( + None + if self.overflow_fix is None + else self.overflow_fix + if isinstance(self.overflow_fix, str) + else self.overflow_fix.value + ) + preset_value = ( + None if self.preset is None else self.preset if isinstance(self.preset, str) else self.preset.value + ) + self_copy = copy.deepcopy(self) + self_copy.overflow_fix = overflow_fix_value + self_copy.preset = preset_value + return self_copy.to_dict() + return super().to_dict() + def _check_default_4bit_configs(config: PretrainedConfig): return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index a6b8aacf43..d5b19bb28c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -100,13 +100,21 @@ def __init__( self._openvino_config = OVConfig(quantization_config=quantization_config) @staticmethod - def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None): + def load_model( + file_name: Union[str, Path], + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + calibration_dataset: Optional = None, + ): """ Loads the model. Arguments: file_name (`str` or `Path`): The path of the model ONNX or XML file. + quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*): + Quantization config to apply after model is loaded. + calibration_dataset (`nncf.Dataset`, *optional*): + Optional nncf.Dataset to feed to model weight compression when quantization config is provided. 
""" def fix_op_names_duplicates(model: openvino.runtime.Model): @@ -135,7 +143,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): from optimum.intel.openvino.quantization import _weight_only_quantization - model = _weight_only_quantization(model, quantization_config) + model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset) return model diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4b156eda9e..44137186e2 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -572,7 +572,7 @@ def _from_pretrained( from_onnx: bool = False, local_files_only: bool = False, load_in_8bit: bool = False, - quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): model_path = Path(model_id) @@ -596,7 +596,12 @@ def _from_pretrained( quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) load_in_4bit = quantization_config.bits == 4 if quantization_config else False - model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config) + calibration_dataset = kwargs.get("calibration_dataset", None) + model = cls.load_model( + model_cache_path, + quantization_config=None if load_in_4bit else quantization_config, + calibration_dataset=calibration_dataset, + ) model_type = config.model_type.replace("_", "-") if model_type == "bloom": @@ -632,7 +637,7 @@ def _from_pretrained( f"For the given model, we recommend the following `quantization_config` : {default_config}" ) - if isinstance(quantization_config.dataset, str): + if calibration_dataset is None and isinstance(quantization_config.dataset, str): tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id) from optimum.gptq.data import get_dataset, prepare_dataset @@ -644,9 +649,9 @@ def _from_pretrained( dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) quantization_config = copy.deepcopy(quantization_config) - quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) + calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) - _weight_only_quantization(model, quantization_config) + _weight_only_quantization(model, quantization_config, calibration_dataset) return causal_model diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index a2579611a4..33985dbe6e 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -18,13 +18,14 @@ import os from collections import deque from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +import datasets import nncf import openvino import torch import transformers -from nncf import CompressWeightsMode, IgnoredScope, SensitivityMetric +from nncf import CompressWeightsMode, SensitivityMetric from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters from nncf.torch import register_module from nncf.torch.initialization import PTInitializingDataLoader @@ -46,7 +47,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from 
..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, @@ -203,39 +204,52 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - calibration_dataset: "Dataset" = None, + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, save_directory: Union[str, Path] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, + weights_only: bool = None, **kwargs, ): """ Quantize a model given the optimization specifications defined in `quantization_config`. Args: - calibration_dataset (`datasets.Dataset`): - The dataset to use for the calibration step. + calibration_dataset (`datasets.Dataset` or `nncf.Dataset` or `Iterable`, *optional*): + A collection of data samples to use for quantization calibration. Is optional for weight-only + quantization and is required for full quantization. save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. - quantization_config (`OVConfig`, *optional*): - The configuration containing the parameters related to quantization. + ov_config (`OVConfig`, *optional*): + The configuration containing the parameters related to quantization. If not provided, 8-bit symmetric + weight-only quantization will be applied. file_name (`str`, *optional*): The model file name to use when saving the model. Overwrites the default file name `"model.onnx"`. - batch_size (`int`, defaults to 8): + batch_size (`int`, defaults to 1): The number of calibration samples to load per batch. data_collator (`DataCollator`, *optional*): The function to use to form a batch from a list of elements of the calibration dataset. remove_unused_columns (`bool`, defaults to `True`): - Whether or not to remove the columns unused by the model forward method. - weights_only (`bool`, defaults to `False`): + Whether to remove the columns unused by the model forward method. + weights_only (`bool`, *optional*): + Being deprecated. Compress weights to integer precision (8-bit by default) while keeping activations floating-point. Fits best for LLM footprint reduction and performance acceleration. 
Examples: + ```python + >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM + >>> from transformers import AutoModelForCausalLM + >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") + >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") + >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig()) + >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") + >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") + ``` + ```python >>> from optimum.intel.openvino import OVQuantizer, OVModelForSequenceClassification >>> from transformers import AutoModelForSequenceClassification @@ -243,53 +257,46 @@ def quantize( >>> # or >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-classification") - >>> quantizer.quantize(calibration_dataset=calibration_dataset, save_directory="./quantized_model") + >>> ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + >>> quantizer.quantize(calibration_dataset=dataset, ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model") ``` - - ```python - >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM - >>> from transformers import AutoModelForCausalLM - >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") - >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") - >>> quantizer.quantize(save_directory="./quantized_model", weights_only=True) - >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") - ``` """ + # TODO: deprecate weights_only argument + if weights_only is not None: + logger.warning( + "`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` " + "as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of " + "OVQuantizationConfig for full model quantization." + ) + if save_directory is None: # TODO : can be set to self.model.config.name_or_path for OVModels when not provided raise ValueError("`save_directory` needs to be specified") - if weights_only: - if calibration_dataset is not None: - logger.warning( - "`calibration_dataset` was provided but will not be used as `weights_only` is set to `True`." - ) - else: - if calibration_dataset is None: - raise ValueError( - "`calibration_dataset` is needed to compute the activations range during the calibration step and was not provided. " - "In case you only want to apply quantization on the weights, please set `weights_only=True`." 
- ) - quantization_config = kwargs.pop("quantization_config", None) - if quantization_config is not None: - logger.warning( - "The argument `quantization_config` is deprecated, and will be removed in optimum-intel v1.6.0, please use `ov_config` instead" - ) - ov_config = ov_config or quantization_config - if ov_config is not None: - if not isinstance(ov_config, OVConfig): - raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + if ov_config is None: + ov_config = OVConfig() + if not isinstance(ov_config, OVConfig): + raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + quantization_config = ov_config.quantization_config + if quantization_config is None: + if weights_only is None or weights_only is True: + if weights_only is None: + logger.info( + "`quantization_config` was not provided, 8-bit asymmetric weight quantization will be applied." + ) + ov_config.quantization_config = OVWeightQuantizationConfig(bits=8) + else: + ov_config.quantization_config = OVQuantizationConfig() if isinstance(self.model, OVBaseModel): self._quantize_ovbasemodel( - calibration_dataset, + ov_config, save_directory, + calibration_dataset, batch_size, data_collator, remove_unused_columns, - weights_only, - ov_config, **kwargs, ) @@ -299,84 +306,97 @@ def quantize( "To convert a PyTorch model to OpenVINO, you can set `export=True` when loading your model as `OVModelForXxx.from_pretrained(..., export=True)`" ) self._quantize_torchmodel( - calibration_dataset, + ov_config, save_directory, + calibration_dataset, file_name, batch_size, data_collator, remove_unused_columns, - weights_only, + **kwargs, ) else: raise TypeError(f"Unsupported model type: {type(self.model)}") def _quantize_ovbasemodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - ov_config: OVConfig = None, **kwargs, ): save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - if weights_only: - q_config = getattr(ov_config, "quantization_config", None) - # Use default 8-bit compression if not provided - q_config = q_config or OVWeightQuantizationConfig(bits=8, sym=True) - _weight_only_quantization(self.model.model, q_config) - + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) return + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + elif isinstance(calibration_dataset, datasets.Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) - if self.model.export_feature 
== "text-generation" and self.model.use_cache: - # Prefeth past_key_values - self.model.update_pkv_precision(True) - self.model.compile() - subset_size = kwargs.get("subset_size", 300) - collected_inputs = [] - - self.model.request = InferRequestWrapper(self.model.request, collected_inputs) - for _, data in enumerate(calibration_dataloader): - self.model.generate(**data, max_new_tokens=1) - if len(collected_inputs) >= subset_size: - break - self.model.request = self.model.request.request - calibration_dataloader = collected_inputs + if self.model.export_feature == "text-generation" and self.model.use_cache: + # Prefetch past_key_values + self.model.update_pkv_precision(True) + self.model.compile() + collected_inputs = [] + + self.model.request = InferRequestWrapper(self.model.request, collected_inputs) + try: + for data in calibration_dataloader: + self.model.generate(**data, max_new_tokens=1) + if len(collected_inputs) >= quantization_config.num_samples: + break + finally: + self.model.request = self.model.request.request + quantization_dataset = nncf.Dataset(collected_inputs) + else: + quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) # Actual model quantization - quantization_dataset = nncf.Dataset(calibration_dataloader) quantized_model = nncf.quantize( self.model.model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), + subset_size=quantization_config.num_samples, + ignored_scope=quantization_config.get_ignored_scope_instance(), + model_type=quantization_config.model_type, + preset=quantization_config.preset, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), **kwargs, ) self.model.model = quantized_model self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) def _quantize_torchmodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - save_onnx_model: bool = False, **kwargs, ): self._set_task() @@ -394,6 +414,7 @@ def _quantize_torchmodel( model_type=model_type, ) + save_onnx_model = ov_config.save_onnx_model onnx_file_name = ( ONNX_WEIGHTS_NAME if file_name is None and save_onnx_model else Path(ov_file_name).with_suffix(".onnx") ) @@ -412,7 +433,8 @@ def _quantize_torchmodel( stateful = ensure_stateful_is_available() and ensure_export_task_support_stateful(task) - if weights_only: + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): if stateful: # patch model before weight compression model = patch_model_with_bettertransformer(model) @@ -426,6 +448,8 @@ def _quantize_torchmodel( nncf.compress_weights(model, dataset=nncf.Dataset([dummy_inputs])) else: + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") if stateful: logger.warn( "Quantization 
algorithm does not support optimized stateful models. " @@ -433,19 +457,29 @@ def _quantize_torchmodel( ) stateful = False - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - - quantization_dataset = nncf.Dataset(calibration_dataloader) + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + elif isinstance(calibration_dataset, datasets.Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) model = nncf.quantize( model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), + subset_size=quantization_config.num_samples, + ignored_scope=quantization_config.get_ignored_scope_instance(), + model_type=quantization_config.model_type, + preset=quantization_config.preset, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), **kwargs, ) @@ -472,6 +506,8 @@ def _quantize_torchmodel( except FileNotFoundError: pass + ov_config.save_pretrained(save_directory) + @staticmethod def _save_pretrained(model: openvino.runtime.Model, output_path: str): compress_quantize_weights_transformation(model) @@ -503,7 +539,7 @@ def get_calibration_dataset( preprocess_batch: bool = True, use_auth_token: bool = False, cache_dir: Optional[str] = None, - ) -> "Dataset": + ) -> datasets.Dataset: """ Create the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -580,18 +616,33 @@ def _remove_unused_columns(self, dataset: "Dataset"): def _weight_only_quantization( - model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict] + model: openvino.runtime.Model, + quantization_config: Union[OVWeightQuantizationConfig, Dict], + calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None, ) -> openvino.runtime.Model: config = quantization_config if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) - dataset = config.dataset - - if config.dataset is not None and isinstance(config.dataset, str): - tokenizer = config.tokenizer - if isinstance(tokenizer, str): - tokenizer = AutoTokenizer.from_pretrained(tokenizer) + if config.dataset is not None and calibration_dataset is not None: + logger.info( + "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " + "quantization. Will rely on `calibration_dataset`." + ) + dataset = None + if calibration_dataset is not None: + if isinstance(calibration_dataset, datasets.Dataset): + raise ValueError( + "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " + "quantization is not supported. Please provide it as `nncf.Dataset` or as iterable of " + "model inputs." 
+ ) + elif isinstance(calibration_dataset, nncf.Dataset): + dataset = calibration_dataset + else: + dataset = nncf.Dataset(calibration_dataset) + elif config.dataset is not None and isinstance(config.dataset, str): + tokenizer = AutoTokenizer.from_pretrained(config.tokenizer) from optimum.gptq.data import get_dataset, prepare_dataset @@ -603,10 +654,6 @@ def _weight_only_quantization( if isinstance(config.sensitivity_metric, str): sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - ignored_scope = None - if isinstance(config.ignored_scope, dict): - ignored_scope = IgnoredScope(**config.ignored_scope) - if config.bits == 8: mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM else: @@ -619,10 +666,10 @@ def _weight_only_quantization( group_size=config.group_size, all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, - # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0 - ignored_scope=ignored_scope, + # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0 + ignored_scope=config.get_ignored_scope_instance(), dataset=dataset, - # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 + # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 ) @@ -691,23 +738,23 @@ def _hybrid_quantization( """ ops_to_compress = _collect_ops_with_weights(model) - ignored_scope = quantization_config.ignored_scope if isinstance(quantization_config.ignored_scope, dict) else {} - ptq_ignored_scope = nncf.IgnoredScope(**ignored_scope) - ptq_ignored_scope.names += ops_to_compress - - wc_quantization_config = copy.deepcopy(quantization_config) - wc_quantization_config.ignored_scope = ignored_scope - wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"] - compressed_model = _weight_only_quantization(model, wc_quantization_config) + wc_config = copy.deepcopy(quantization_config) + wc_config.ignored_scope = wc_config.ignored_scope or {} + wc_config.ignored_scope["types"] = wc_config.ignored_scope.get("types", []) + ["Convolution"] + compressed_model = _weight_only_quantization(model, wc_config) + ptq_ignored_scope = quantization_config.get_ignored_scope_instance() + ptq_ignored_scope.names += ops_to_compress subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, calibration_dataset=nncf.Dataset(dataset), model_type=nncf.ModelType.TRANSFORMER, ignored_scope=ptq_ignored_scope, - # The SQ algo should be disabled for MatMul nodes because their weights are already compressed - advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)), + # SQ algo should be disabled for MatMul nodes because their weights are already compressed + advanced_parameters=nncf.AdvancedQuantizationParameters( + smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1) + ), subset_size=subset_size, ) return quantized_model diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 25b7f35d07..873b0909c8 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -88,7 +88,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_transformers_version -from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig +from .configuration import OVConfig from .quantization 
import OVDataLoader from .training_args import OVTrainingArguments from .utils import ( @@ -140,6 +140,25 @@ NNCF_LOG_FILE_NAME = "nncf_output.log" +DEFAULT_QUANTIZATION_CONFIG = { + "algorithm": "quantization", + "preset": "mixed", + "overflow_fix": "disable", + "initializer": { + "range": {"num_init_samples": 300, "type": "mean_min_max"}, + "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, + }, + "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, + "ignored_scopes": [ + "{re}.*Embedding.*", + "{re}.*add___.*", + "{re}.*layer_norm_.*", + "{re}.*matmul_1", + "{re}.*__truediv__.*", + ], +} + + def _onnx_export_nncf_model(model: NNCFNetwork, config: OnnxConfig, output: Union[str, io.BytesIO], opset: int = None): # TODO: remove it when fix controller.strip(copy=True) behavior signature = inspect.signature(model.forward) @@ -232,6 +251,16 @@ def __init__( if self.ov_config is not None: if self.ov_config.compression is None: self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG + if ( + isinstance(self.ov_config.compression, dict) + and "algorithm" in self.ov_config.compression + and self.ov_config.compression["algorithm"] == "quantization" + ): + self.ov_config.compression["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model + elif isinstance(self.ov_config.compression, list): + for i, algo_config in enumerate(self.ov_config.compression): + if algo_config["algorithm"] == "quantization": + self.ov_config.compression[i]["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model if self.args.do_train: self._set_task() diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 0e307fb036..b22d5e3955 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -15,15 +15,19 @@ # ruff: noqa import itertools +import logging import tempfile import unittest from collections import defaultdict +from enum import Enum from functools import partial +from typing import List, Union import evaluate import numpy as np import torch from datasets import load_dataset +from nncf.quantization.advanced_parameters import OverflowFix from parameterized import parameterized import openvino.runtime as ov import nncf @@ -37,6 +41,7 @@ TrainingArguments, default_data_collator, ) +from transformers.utils.quantization_config import QuantizationMethod from optimum.intel import ( OVConfig, @@ -55,8 +60,10 @@ OVStableDiffusionXLPipeline, OVQuantizer, OVTrainer, + OVQuantizationConfig, OVWeightQuantizationConfig, ) +from optimum.intel.openvino.configuration import OVQuantizationMethod, OVQuantizationConfigBase from optimum.intel.openvino.quantization import InferRequestWrapper from optimum.intel.utils.import_utils import is_openvino_version @@ -98,7 +105,13 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, file_name=file_name) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize( + save_directory=tmp_dir, + calibration_dataset=calibration_dataset, + file_name=file_name, + ov_config=ov_config, + ) model = model_cls.from_pretrained(tmp_dir, file_name=file_name) # TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm) @@ -110,6 +123,10 @@ def preprocess_function(examples, tokenizer): outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + 
loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): task = model_cls.export_feature @@ -134,7 +151,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) model = model_cls.from_pretrained(tmp_dir) @@ -146,6 +164,10 @@ def preprocess_function(examples, tokenizer): outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + class OVWeightCompressionTest(unittest.TestCase): # TODO : add models @@ -210,7 +232,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", dataset="ptb", - awq=True, + quant_method=QuantizationMethod.AWQ, ), 14, ), @@ -251,7 +273,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -261,6 +283,15 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + original_config_as_dict = OVWeightQuantizationConfig().to_dict() + for k in original_config_as_dict.keys(): + v = original_config_as_dict[k] + if isinstance(v, Enum): + original_config_as_dict[k] = v.value + self.assertEqual(original_config_as_dict, loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature @@ -272,7 +303,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -282,6 +313,10 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) def 
test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4): task = model_cls.export_feature @@ -297,7 +332,6 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, ratio=0.8)) quantizer.quantize( save_directory=tmp_dir, - weights_only=True, ov_config=ov_config, ) model = model_cls.from_pretrained(tmp_dir) @@ -310,6 +344,10 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above") def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, expected_pt_int8, expected_ov_int8): @@ -322,7 +360,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -332,6 +370,10 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False) @@ -401,17 +443,18 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") if model_id == "facebook/opt-125m": for key, value in self.DEFAULT_INT4_CONFIG.items(): - self.assertEqual(value, openvino_config.quantization_config[key]) + self.assertEqual(value, getattr(openvino_config.quantization_config, key)) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( self, model_cls, model_id, quantization_config, expected_ov_int4 ): with tempfile.TemporaryDirectory() as tmp_dir: + quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -422,7 +465,7 @@ def test_ovmodel_4bit_auto_compression_with_config( model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + 
self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS) @@ -453,9 +496,8 @@ def transform_fn(data, tokenizer): model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig( - bits=4, sym=True, group_size=-1, ratio=0.8, dataset=quantization_dataset - ), + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + calibration_dataset=quantization_dataset, ) _, num_int8, num_int4 = get_num_quantized_nodes(model) @@ -545,7 +587,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): "all_layers": None, "sensitivity_metric": None, "dataset": None, - "ignored_scope": None, + "ignored_scope": nncf.IgnoredScope(), } compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @@ -571,7 +613,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -586,6 +629,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_ovmodel_static_quantization(self, model_name): def preprocess_function(examples, tokenizer): @@ -604,7 +651,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -619,6 +667,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + class OVTrainerTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 50, 38),) @@ -666,6 +718,150 @@ def compute_metrics(p): self.assertTrue("logits" in outputs) +class OVQuantizationConfigTest(unittest.TestCase): + QUANTIZATION_CONFIGS = ( + (None,), + (OVWeightQuantizationConfig(),), + ( + OVWeightQuantizationConfig( + bits=8, + sym=True, + ), + ), + ( + OVWeightQuantizationConfig( + dataset="wikitext", + bits=4, + ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + num_samples=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + ), + 
(OVWeightQuantizationConfig(dataset=["hello world", "i'm alive"]),), + ( + OVQuantizationConfig( + ignored_scope={"names": ["op_name"]}, + num_samples=100, + preset=nncf.QuantizationPreset.MIXED, + model_type=nncf.ModelType.TRANSFORMER, + fast_bias_correction=True, + overflow_fix=OverflowFix.DISABLE, + ), + ), + (OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),), + ) + + QUANTIZATION_CONFIG_DICTS = ( + (dict(bits=8, sym=True), OVWeightQuantizationConfig, None), + ( + dict( + dataset="wikitext", + bits=4, + ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + num_samples=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + OVWeightQuantizationConfig, + None, + ), + (dict(), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + ( + dict(ignored_scope={"names": ["op_name"]}), + OVWeightQuantizationConfig, + "Can't determine type of OV quantization config", + ), + (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + ( + dict(bits=8, fast_bias_correction=True), + OVWeightQuantizationConfig, + "Can't determine type of OV quantization config", + ), + (dict(model_type=nncf.ModelType.TRANSFORMER), OVQuantizationConfig, None), + ( + dict( + ignored_scope={"names": ["op_name"]}, + num_samples=100, + preset=nncf.QuantizationPreset.MIXED, + model_type=nncf.ModelType.TRANSFORMER, + fast_bias_correction=True, + overflow_fix=OverflowFix.DISABLE, + ), + OVQuantizationConfig, + None, + ), + (dict(weight_only=True), OVWeightQuantizationConfig, None), + (dict(weight_only=False), OVQuantizationConfig, None), + (dict(abc="def", weight_only=False), OVQuantizationConfig, None), + (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None), + (dict(bits=8, fast_bias_correction=True, weight_only=True), OVWeightQuantizationConfig, None), + (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None), + (dict(bits=8, sym=True, weight_only=False), OVWeightQuantizationConfig, "Please check your configuration"), + ( + dict(model_type=nncf.ModelType.TRANSFORMER, weight_only=True), + OVQuantizationConfig, + "Please check your configuration", + ), + ) + + @parameterized.expand(QUANTIZATION_CONFIGS) + def test_config_serialization(self, quantization_config: OVQuantizationConfigBase): + def str_to_enum(enum_cls, value): + for k, v in enum_cls.__members__.items(): + if getattr(enum_cls, k).value == value: + return v + raise ValueError(f"Could not convert string {value} to enum value of type {enum_cls}") + + ov_config = OVConfig(quantization_config=quantization_config) + with tempfile.TemporaryDirectory() as tmp_dir: + ov_config.save_pretrained(tmp_dir) + loaded_ov_config = OVConfig.from_pretrained(tmp_dir) + + if quantization_config is None: + self.assertEqual(loaded_ov_config.quantization_config, None) + return + for key, value in loaded_ov_config.quantization_config.to_dict().items(): + initial_value = getattr(ov_config.quantization_config, key) + if key == "preset" or key == "overflow_fix": + # TODO: remove once NNCF is updated to 2.10 + if getattr(quantization_config, key) is not None: + self.assertTrue(isinstance(value, str)) + if key == "preset": + value = str_to_enum(nncf.QuantizationPreset, value) + else: + value = 
str_to_enum(OverflowFix, value) + self.assertEqual(value, initial_value) + + @parameterized.expand(QUANTIZATION_CONFIG_DICTS) + def test_config_from_dict(self, quantization_config: dict, config_type: type, warning_log: Union[str, None]): + from optimum.intel.openvino.configuration import logger as configuration_logger + + if warning_log is not None: + with self.assertLogs(configuration_logger, logging.WARN) as cm: + ov_config = OVConfig(quantization_config=quantization_config) + self.assertTrue(any(warning_log in log for log in cm.output)) + else: + ov_config = OVConfig(quantization_config=quantization_config) + self.assertIsInstance(ov_config.quantization_config, config_type) + for k, v in quantization_config.items(): + if k == "weight_only" and warning_log == "Please check your configuration": + continue + if hasattr(ov_config.quantization_config, k): + self.assertEqual(getattr(ov_config.quantization_config, k), v) + + class InferRequestWrapperTest(unittest.TestCase): MODEL_ID = ("openai/whisper-tiny.en",) APPLY_CACHING = (False, True) diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index 80298faf2b..db443c6de2 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -45,14 +45,14 @@ from transformers.utils import WEIGHTS_NAME from optimum.intel.openvino import OVTrainingArguments -from optimum.intel.openvino.configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig +from optimum.intel.openvino.configuration import OVConfig from optimum.intel.openvino.modeling import ( OVModel, OVModelForAudioClassification, OVModelForImageClassification, OVModelForSequenceClassification, ) -from optimum.intel.openvino.trainer import OVTrainer +from optimum.intel.openvino.trainer import DEFAULT_QUANTIZATION_CONFIG, OVTrainer from optimum.intel.openvino.utils import OV_XML_FILE_NAME From bce36d2938dabdfe757352c87735fcba6d2fe969 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Wed, 17 Apr 2024 10:13:15 +0200 Subject: [PATCH 16/25] Fix model caching for diffusion models and multiple GPUs (#665) --- optimum/intel/openvino/modeling_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index eb407b4cd1..1e562749b2 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -671,7 +671,7 @@ def _compile(self): if ( "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()) - and self.device.lower().split(":")[0] == "gpu" + and "gpu" in self.device.lower() ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") From 658eef18f788ddb3cdca5350cbbe349153ff25b8 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Wed, 17 Apr 2024 14:09:14 +0100 Subject: [PATCH 17/25] Fix dataset in stable_diffusion_hybrid_quantization notebook (#667) --- ...stable_diffusion_hybrid_quantization.ipynb | 104 +++++++++++------- 1 file changed, 63 insertions(+), 41 deletions(-) diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index b5d9ae7001..41969b162a 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "dffab375-a730-4015-8d17-360b76a0718d", "metadata": 
{}, "outputs": [], @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "0407fc92-c052-47b7-8721-01836adf3b54", "metadata": { "execution": { @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "id": "c32f9a76-414b-43d9-9769-af131223f1c1", "metadata": { "execution": { @@ -125,14 +125,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "92a3f434", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "caab335ab7b146bba41c81e8688759f9", + "model_id": "31afdad1b284494aa51e668f5d8fc5c8", "version_major": 2, "version_minor": 0 }, @@ -147,7 +147,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'image': , 'filename': 'COCO_train2014_000000494175.jpg', 'cocoid': 494175, 'caption': 'A crowded arena filled with people and confetti.'}\n" + "{'image': , 'filename': 'COCO_train2014_000000494175.jpg', 'cocoid': 494175, 'caption': 'A crowded arena filled with people and confetti.'}\n" ] } ], @@ -158,24 +158,18 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "8be68958-ce5e-4cc6-b8e7-2867feaf084b", - "metadata": { - "execution": { - "iopub.execute_input": "2022-12-04T14:40:31.358230Z", - "iopub.status.busy": "2022-12-04T14:40:31.358053Z", - "iopub.status.idle": "2022-12-04T14:40:31.360666Z", - "shell.execute_reply": "2022-12-04T14:40:31.360301Z", - "shell.execute_reply.started": "2022-12-04T14:40:31.358218Z" - }, - "tags": [] - }, + "execution_count": 5, + "id": "1036fe23", + "metadata": {}, "outputs": [], "source": [ "def preprocess_fn(example):\n", " return {\"prompt\": example[\"caption\"]}\n", "\n", - "calibration_dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)" + "NUM_SAMPLES = 200\n", + "dataset = dataset.take(NUM_SAMPLES)\n", + "dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)\n", + "calibration_dataset = list(dataset)" ] }, { @@ -190,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "31c5415e-e22b-4ab9-b903-8791e80b188d", "metadata": { "execution": { @@ -204,7 +198,6 @@ }, "outputs": [], "source": [ - "NUM_SAMPLES = 200\n", "quantization_config = OVWeightQuantizationConfig(bits=8, dataset=calibration_dataset, num_samples=NUM_SAMPLES)\n", "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True, quantization_config=quantization_config)\n", "int8_pipe.save_pretrained(int8_model_path)" @@ -251,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "6c2f615a-19e3-4ee2-9309-2ae1392c7f62", "metadata": { "execution": { @@ -290,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "90902149", "metadata": {}, "outputs": [], @@ -311,7 +304,36 @@ "execution_count": null, "id": "02f01fc1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eb96ea3ef90f4b7488cb7b92853b5ef7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00" ] @@ -350,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "1eeaa81f-7fc5-49ba-80b8-2d95a1310a0c", "metadata": { "execution": { @@ -388,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "91134d48", "metadata": {}, "outputs": [ @@ -397,7 +419,7 @@ "output_type": 
"stream", "text": [ "FP32 model size: 4920.93 MB\n", - "INT8 model size: 1240.23 MB\n", + "INT8 model size: 1240.29 MB\n", "INT8 size decrease: 3.97x\n" ] } @@ -424,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "8806da79-0b3b-403e-a40c-61db6a0f482d", "metadata": { "execution": { @@ -447,7 +469,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6816982d5d7e412fb02d1efcf972a95d", + "model_id": "bdf72f14022b4944b2fe6f5e482b01f2", "version_major": 2, "version_minor": 0 }, @@ -461,7 +483,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7c51ada2ecb84e6398f82c10acd523ae", + "model_id": "fd9c55cebb4646b7b606a4d6b177889b", "version_major": 2, "version_minor": 0 }, @@ -475,7 +497,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7bf9616672ef4743a5e05f519e47ccae", + "model_id": "2599d0b1599a4bedac510cd0382262d0", "version_major": 2, "version_minor": 0 }, @@ -490,13 +512,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Latency of original FP32 model: 355705.40 ms\n" + "Latency of original FP32 model: 212177.56 ms\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2a8db52629c7434da87864707e1c8023", + "model_id": "8ca3ba9b0dca486dbc5fa6c5b508f1b8", "version_major": 2, "version_minor": 0 }, @@ -510,7 +532,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f65d645f05464727bb05b7a0028353fb", + "model_id": "27a597e75cc44a3db5360bbde7b613e8", "version_major": 2, "version_minor": 0 }, @@ -524,7 +546,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f1452100a86443e29f4d6f62473cfecf", + "model_id": "f15ec840672b4568aea1882539d7bb33", "version_major": 2, "version_minor": 0 }, @@ -539,14 +561,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Latency of quantized model: 315484.52 ms\n", - "Speedup: 1.13x\n" + "Latency of quantized model: 162504.42 ms\n", + "Speedup: 1.31x\n" ] } ], "source": [ "def get_val_dataset(num_items=3):\n", - " return [item[\"caption\"] for item in dataset.take(num_items)]\n", + " return [item[\"prompt\"] for item in dataset.take(num_items)]\n", "\n", "def benchmark(pipeline, dataset):\n", " \"\"\"\n", From aca2b6c9a238a1cf7819617812a40150dedea5b3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 17 Apr 2024 16:20:31 +0200 Subject: [PATCH 18/25] Update setup optimum version (#670) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a8c43f51d4..3978fd1fd6 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", "transformers>=4.36.0,<4.40.0", - "optimum~=1.18", + "optimum~=1.19", "datasets>=1.4.0", "sentencepiece", "scipy", From 228a3e080922fe327c1d516eccbc5856178d2f1c Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 18 Apr 2024 10:16:01 +0200 Subject: [PATCH 19/25] Fix compatibility for latest itrex version (#658) * Fix compatibility for latest itrex version * Fix format * update example * update test * fix github workflow * clean tests * Update optimum/intel/neural_compressor/quantization.py Co-authored-by: Cheng, Penghui * format --------- Co-authored-by: Cheng, Penghui --- .github/workflows/test_inc.yml | 14 +- .../language-modeling/run_clm.py | 41 ++-- .../intel/neural_compressor/modeling_base.py | 24 +-- .../neural_compressor/modeling_decoder.py | 27 --- 
.../intel/neural_compressor/quantization.py | 122 ++++++------ tests/neural_compressor/test_ipex.py | 86 +++++++++ tests/neural_compressor/test_optimization.py | 177 ++++++++---------- tests/neural_compressor/utils_tests.py | 53 +++++- 8 files changed, 310 insertions(+), 234 deletions(-) delete mode 100644 optimum/intel/neural_compressor/modeling_decoder.py create mode 100644 tests/neural_compressor/test_ipex.py diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 16c01e7298..1a8cb28bab 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,11 +32,17 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] - pip install intel-extension-for-pytorch==2.1.100 - pip install intel-extension-for-transformers==1.3.2 + pip install intel-extension-for-transformers pip install peft + - name: Test with Pytest run: | - pytest tests/neural_compressor/ + pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0 + - name: Test IPEX + run: | + pip uninstall -y intel-extension-for-transformers + pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu + pip install intel-extension-for-pytorch==2.1.100 + pytest tests/neural_compressor/test_ipex.py + diff --git a/examples/neural_compressor/language-modeling/run_clm.py b/examples/neural_compressor/language-modeling/run_clm.py index ef24616307..1799ad6782 100644 --- a/examples/neural_compressor/language-modeling/run_clm.py +++ b/examples/neural_compressor/language-modeling/run_clm.py @@ -64,8 +64,7 @@ if is_intel_extension_for_transformers_available(): - from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig - + from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -227,8 +226,9 @@ class OptimizationArguments: metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."}, ) quantization_methodology: str = field( - default="RTN", - metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."}, + choices=["rtn", "gptq"], + default="rtn", + metadata={"help": "Quantization methodology for weight only quantization. 
Choose from 'rtn' and 'gptq'."}, ) damp_percent: float = field( default=0.01, @@ -662,22 +662,23 @@ def compute_metrics(eval_preds): raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization")) if optim_args.apply_pruning or optim_args.apply_distillation: raise ValueError("Weight only quantization and pruning or distillation cannot be combined.") - if optim_args.quantization_methodology == "GPTQ": - algorithm_args = { - "act_order": False, - "percdamp": optim_args.damp_percent, - "block_size": optim_args.gptq_block_size, - "nsamples": optim_args.num_calibration_samples, - "use_max_length": optim_args.use_max_length, - "pad_max_length": optim_args.pad_max_length, - } - quantization_config = WeightOnlyQuantConfig( - weight_dtype=optim_args.weight_dtype, - group_size=optim_args.group_size, - scheme=optim_args.weight_only_scheme, - algorithm=optim_args.quantization_methodology, - algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None, - ) + + algorithm_args = { + "weight_dtype": optim_args.weight_dtype, + "sym": optim_args.weight_only_scheme == "sym", + "group_size": optim_args.group_size, + } + + if optim_args.quantization_methodology == "gptq": + quantization_config = GPTQConfig( + damp_percent=optim_args.damp_percent, + nsamples=optim_args.num_calibration_samples, + blocksize=optim_args.gptq_block_size, + **algorithm_args, + ) + else: + quantization_config = RtnConfig(**algorithm_args) + else: quantization_config = PostTrainingQuantConfig( approach=optim_args.quantization_approach, recipes=recipes diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index c7a48aedb9..c46e3f41c5 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -67,11 +67,6 @@ """ -if is_intel_extension_for_transformers_available(): - from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM as ITREX_WOQ_MODEL - from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig - - class INCModel(OptimizedModel): auto_model_class = AutoModel export_feature = "feature-extraction" @@ -142,15 +137,16 @@ def _from_pretrained( msg = None if is_intel_extension_for_transformers_available(): try: - quantization_config = WeightOnlyQuantConfig.from_pretrained(model_id) - algorithm = getattr(quantization_config, "algorithm", None) - if algorithm is not None and quantization_config.algorithm.lower() in { - "rtn", - "gptq", - "awq", - "autoaround", - }: - return ITREX_WOQ_MODEL.from_pretrained( + quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json") + algorithm = getattr(quantization_config, "quant_method", None) + if algorithm in {"rtn", "gptq", "awq", "autoaround"}: + from intel_extension_for_transformers.transformers.modeling.modeling_auto import ( + _BaseQBitsAutoModelClass, + ) + + _BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class + + return _BaseQBitsAutoModelClass.from_pretrained( pretrained_model_name_or_path=model_id, use_auth_token=use_auth_token, revision=revision, diff --git a/optimum/intel/neural_compressor/modeling_decoder.py b/optimum/intel/neural_compressor/modeling_decoder.py deleted file mode 100644 index f56969675b..0000000000 --- a/optimum/intel/neural_compressor/modeling_decoder.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. 
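To round out the `_from_pretrained` rework above, a minimal sketch of reloading a weight-only checkpoint saved by this flow (the directory path is illustrative); the `quantize_config.json` written next to the weights is what routes loading through ITREX's `_BaseQBitsAutoModelClass`:

    from optimum.intel import INCModelForCausalLM

    # Any directory produced by INCQuantizer.quantize with an RTN/GPTQ/AWQ config should work here.
    loaded_model = INCModelForCausalLM.from_pretrained("path/to/weight-only-model")  # illustrative path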
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import warnings - -from .modeling_base import INCModelForCausalLM - - -class INCModelForCausalLM(INCModelForCausalLM): - # warning at import time - warnings.warn( - "Importing `INCModelForCausalLM` from `optimum/intel/neural_compressor/modeling_decoder.py` is deprecated and will " - "be removed in a future verson of optimum-intel. Import as `from optimum.intel.neural_compressor import INCModelForCausalLM instead.", - FutureWarning, - ) diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index c20302c4bd..09f651df05 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -72,16 +72,22 @@ from .utils import INCDataLoader, _cfgs_to_fx_cfgs +INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.4.0" + if is_intel_extension_for_transformers_available(): - INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.3.2" if is_intel_extension_for_transformers_version("!=", INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION): raise ImportError( f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_intel_extension_for_transformers_version}, " f"but only version {INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION} is supported." ) - from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model + from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit - from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig + from intel_extension_for_transformers.transformers.utils.config import ( + AwqConfig, + GPTQConfig, + ITREXQuantizationConfigMixin, + RtnConfig, + ) logger = logging.getLogger(__name__) @@ -89,7 +95,7 @@ NEURAL_COMPRESSOR_MINIMUM_VERSION = "2.1.0" NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION = "2.3.0" IPEX_MINIMUM_VERSION = "2.1.0" -_ITREX_TORCH_VERSION = "2.1.0" +ITREX_MINIMUM_TORCH_VERSION = "2.2.0" if is_neural_compressor_version("<", NEURAL_COMPRESSOR_MINIMUM_VERSION): raise ImportError( @@ -152,21 +158,20 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - quantization_config: Union["PostTrainingQuantConfig", "WeightOnlyQuantConfig"], + quantization_config: Union["PostTrainingQuantConfig", "ITREXQuantizationConfigMixin"], save_directory: Union[str, Path], calibration_dataset: Dataset = None, batch_size: int = 8, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, file_name: str = None, - weight_only: bool = False, **kwargs, ): """ Quantize a model given the optimization specifications defined in `quantization_config`. 
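As a usage sketch of this reworked entry point (the model id and config values below mirror the weight-only tests added later in this patch; the output directory is illustrative):

    from transformers import AutoModelForCausalLM
    from intel_extension_for_transformers.transformers.utils.config import RtnConfig
    from optimum.intel import INCQuantizer

    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GPTNeoForCausalLM")
    quantizer = INCQuantizer.from_pretrained(model, task="text-generation")

    # RTN is data-free, so no calibration_dataset is needed; GPTQ/AWQ configs would
    # trigger the calibration dataset check added further down in this hunk.
    quantizer.quantize(
        quantization_config=RtnConfig(bits=4, weight_dtype="int4_clip"),
        save_directory="tiny-gptneo-woq",  # illustrative output directory
    )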
Args: - quantization_config (`Union[PostTrainingQuantConfig, WeightOnlyQuantConfig]`): + quantization_config (`Union[PostTrainingQuantConfig, ITREXQuantizationConfigMixin]`): The configuration containing the parameters related to quantization. save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. @@ -178,9 +183,6 @@ def quantize( The function to use to form a batch from a list of elements of the calibration dataset. remove_unused_columns (`bool`, defaults to `True`): Whether or not to remove the columns unused by the model forward method. - weight_only (`bool`, defaults to `False`): - Whether compress weights to integer precision (4-bit by default) while keeping activations - floating-point. Fits best for LLM footprint reduction and performance acceleration. """ save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) @@ -188,16 +190,41 @@ def quantize( device = kwargs.pop("device", "cpu") use_cpu = device == torch.device("cpu") or device == "cpu" use_xpu = device == torch.device("xpu") or device == "xpu" + calibration_dataloader = None - if save_onnx_model and (isinstance(self._original_model, ORTModel) or weight_only): + if save_onnx_model and isinstance(self._original_model, ORTModel): save_onnx_model = False logger.warning("Model provided is an ONNX model, `save_onnx_model` is set to False") default_name = WEIGHTS_NAME if not isinstance(self._original_model, ORTModel) else ONNX_WEIGHTS_NAME - calibration_dataloader = None self._set_task() - if weight_only or not isinstance(quantization_config, PostTrainingQuantConfig): + if kwargs.pop("weight_only", None) is not None: + logger.warning( + "`weight_only` is deprecated. Use `quantization_config` instead to specify which methodology and quantization pamraters to apply." + ) + + if ( + isinstance(quantization_config, PostTrainingQuantConfig) + and quantization_config.backend == "ipex" + and is_ipex_version("<", IPEX_MINIMUM_VERSION) + and "generation" in self.task + ): + raise ImportError( + f"Found an incompatible version of intel-extension-for-pytorch. Found version {_ipex_version}, " + f"but only version {IPEX_MINIMUM_VERSION} or higher is supported." + ) + + if save_onnx_model: + if ( + not isinstance(quantization_config, PostTrainingQuantConfig) + or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC + ): + logger.warning("ONNX export for dynamic and weight only quantized model is not supported.") + save_onnx_model = False + + # ITREX Weight Only Quantization + if not isinstance(quantization_config, PostTrainingQuantConfig): # check neural-compressor version if is_neural_compressor_version("<", NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION): raise ImportError( @@ -207,53 +234,45 @@ def quantize( if not is_intel_extension_for_transformers_available(): raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("Weight only quantization")) - if is_torch_version("!=", _ITREX_TORCH_VERSION): + if is_torch_version("<", ITREX_MINIMUM_TORCH_VERSION): raise ImportError( f"Found an incompatible version of `torch`. Found version {_torch_version}, " - f"but only version {_ITREX_TORCH_VERSION} is supported." + f"but only version {ITREX_MINIMUM_TORCH_VERSION} or higher is supported." 
) - if quantization_config is None: - quantization_config = WeightOnlyQuantConfig() - algo = "RTN" - elif isinstance(quantization_config, WeightOnlyQuantConfig): - algo = quantization_config.algorithm - else: + if not isinstance(quantization_config, ITREXQuantizationConfigMixin): raise TypeError( - f"For weight-only quantization, `quantization_config` should be an instance of `WeightOnlyQuantConfig`, but got: {type(quantization_config)} instead." + "`quantization_config` should either be an instance of `neural_compressor.config.PostTrainingQuantConfig` or " + f"`intel_extension_for_transformers.transformers.utils.config.ITREXQuantizationConfigMixin` but got: {type(quantization_config)} instead." ) - if algo not in ["RTN", "GPTQ"]: - raise ValueError(f"Weight-only quantization is only support RTN and GPTQ algorithm now!But got {algo}") + if not isinstance(quantization_config, (GPTQConfig, RtnConfig)): + raise ValueError( + f"Weight-only quantization is only support RTN and GPTQ algorithm now! But got {quantization_config}" + ) - if calibration_dataset is None and quantization_config.tokenizer is None and ("GPTQ" in algo): + if calibration_dataset is None and isinstance(quantization_config, (GPTQConfig, AwqConfig)): raise ValueError( "Weight-only quantization needs a calibration dataset for both GPTQ and AWQ methodologies." ) - if calibration_dataset is None: - calibration_dataloader = None - else: + if calibration_dataset is not None: calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, remove_unused_columns=remove_unused_columns, data_collator=data_collator, - use_label=False if "GPTQ" in algo else True, + use_label=not isinstance(quantization_config, (GPTQConfig)), ) quantization_config.calib_dataloader = calibration_dataloader - save_onnx_model = False - elif INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.STATIC: # Since PyTorch fx trace does not really require an example_inputs, only need calibration_dataset or calibration_fn here. if calibration_dataset is None and self.calibration_fn is None: raise ValueError( "Post-training static quantization needs a calibration dataset or a calibration_function." ) - if calibration_dataset is None: - calibration_dataloader = None - else: + if calibration_dataset is not None: quantization_config.calibration_sampling_size = len(calibration_dataset) calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, @@ -266,45 +285,24 @@ def quantize( logger.warning("ONNX export is no supported for model with quantized embeddings") save_onnx_model = False - else: - # Disable ONNX export for dynamically quantized model as deprecated in neural-compressor>=2.2.0 - if save_onnx_model: - logger.warning( - "ONNX export for dynamic quantized model is no longer supported by neural-compressor>=2.2.0. " - "To apply dynamic quantization on an ONNX model, you can use optimum.onnxruntime.ORTQuantizer" - ) - save_onnx_model = False - - if ( - isinstance(quantization_config, PostTrainingQuantConfig) - and quantization_config.backend == "ipex" - and is_ipex_version("<", IPEX_MINIMUM_VERSION) - and "generation" in self.task - ): - raise ImportError( - f"Found an incompatible version of intel-extension-for-pytorch. Found version {_ipex_version}, " - f"but only version {IPEX_MINIMUM_VERSION} or higher is supported." 
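For the GPTQ branch validated just above, a hedged sketch of supplying calibration data (the inline text samples and output directory are illustrative; the config values mirror the new WeightOnlyQuantizationTest):

    from datasets import Dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig
    from optimum.intel import INCQuantizer

    model_id = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
    model = AutoModelForCausalLM.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # GPTQ requires calibration samples: build a tiny tokenized datasets.Dataset in-line.
    texts = ["This is a sample output", "Quantization keeps activations in floating point"]
    encodings = tokenizer(texts, padding="max_length", max_length=64, truncation=True)
    calibration_dataset = Dataset.from_dict(dict(encodings))

    quantizer = INCQuantizer.from_pretrained(model, task="text-generation")
    quantizer.quantize(
        # max_input_length mirrors the test workaround noted for neural-compressor <= v2.5.1
        quantization_config=GPTQConfig(
            bits=4, sym=True, damp_percent=0.01, weight_dtype="int4_clip", max_input_length=128
        ),
        calibration_dataset=calibration_dataset,
        save_directory="tiny-gptneo-gptq",  # illustrative output directory
    )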
- ) - if not isinstance(quantization_config, PostTrainingQuantConfig): if use_cpu: # will remove after intel-extension-for-transformers 1.3.3 release. quantization_config.device = "cpu" - quantization_config.post_init() + quantization_config.post_init_cpu() elif use_xpu: # will remove after intel-extension-for-transformers 1.3.3 release. quantization_config.device = "xpu" quantization_config.post_init_xpu() + self._quantized_model = convert_to_quantized_model( self._original_model, quantization_config, device=quantization_config.device ) - # will remove after intel-extension-for-transformers 1.3.3 release. - if hasattr(quantization_config, "calib_dataloader"): - quantization_config.calib_dataloader = None + self._quantized_model.quantization_config = quantization_config self._quantized_model.save_pretrained = types.MethodType(save_low_bit, self._quantized_model) - # Save the quantized model self._quantized_model.save_pretrained(save_directory) + else: if isinstance(self._original_model.config, PretrainedConfig): self._original_model.config.backend = quantization_config.backend @@ -336,10 +334,7 @@ def quantize( ) if not hasattr(compressed_model, "_model") or compressed_model._model is None: - raise RuntimeError( - "The maximum number of trials specified has been reached and no quantized model meeting the specified" - " accuracy tolerance has been found. Either the tolerance or the number of trials need to be increased." - ) + raise RuntimeError("Calling `neural_compressor.fit` returned unexpected results") if isinstance(self._original_model.config, PretrainedConfig): # If backend is IPEX, then the quantized model is JIT model which will drop the config attribute, @@ -376,7 +371,6 @@ def quantize( self._save_pretrained(compressed_model, output_path) quantization_config = INCConfig(quantization=quantization_config, save_onnx_model=save_onnx_model) quantization_config.save_pretrained(save_directory) - return self._quantized_model @staticmethod def _save_pretrained(model: Union[PyTorchModel, IPEXModel], output_path: str): diff --git a/tests/neural_compressor/test_ipex.py b/tests/neural_compressor/test_ipex.py new file mode 100644 index 0000000000..ef16dbed19 --- /dev/null +++ b/tests/neural_compressor/test_ipex.py @@ -0,0 +1,86 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# ruff: noqa + + +import os +import tempfile + +from neural_compressor.config import PostTrainingQuantConfig + +from parameterized import parameterized +from transformers import ( + AutoModelForCausalLM, + AutoModelForQuestionAnswering, + AutoTokenizer, + set_seed, +) +from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset + + +from optimum.intel import ( + INCConfig, + INCModelForCausalLM, + INCModelForSeq2SeqLM, + INCModelForQuestionAnswering, + INCModelForSequenceClassification, + INCModelForMaskedLM, + INCModelForTokenClassification, + INCQuantizer, + INCSeq2SeqTrainer, +) +from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification +from optimum.pipelines import ORT_SUPPORTED_TASKS + + +os.environ["CUDA_VISIBLE_DEVICES"] = "" +set_seed(SEED) + + +class IPEXQuantizationTest(INCTestMixin): + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("text-classification", "bert", 21),) + + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + def test_ipex_static_quantization_with_smoothquant(self, task, model_arch, expected_quantized_matmuls): + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}} + num_samples = 10 + model_name = MODEL_NAMES[model_arch] + quantization_config = PostTrainingQuantConfig(approach="static", backend="ipex", recipes=recipes) + model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + quantizer = INCQuantizer.from_pretrained(model, task=task) + calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples) + + with tempfile.TemporaryDirectory() as tmp_dir: + quantizer.quantize( + quantization_config=quantization_config, + calibration_dataset=calibration_dataset, + save_directory=tmp_dir, + save_onnx_model=False, + ) + self.check_model_outputs( + q_model=quantizer._quantized_model, + task=task, + tokenizer=tokenizer, + save_directory=tmp_dir, + expected_quantized_matmuls=expected_quantized_matmuls, + is_static=True, + load_onnx_model=False, + num_samples=num_samples, + load_inc_model=False, + load_ipex_model=True, + ) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index 9d85b85cbd..e38ba8e327 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -44,7 +44,7 @@ pipeline, set_seed, ) -from utils_tests import SEED, INCTestMixin, _generate_dataset +from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset from optimum.intel.utils.import_utils import is_torch_version, is_intel_extension_for_transformers_available @@ -64,38 +64,30 @@ from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification from optimum.pipelines import ORT_SUPPORTED_TASKS -if is_intel_extension_for_transformers_available(): - from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig os.environ["CUDA_VISIBLE_DEVICES"] = "" set_seed(SEED) -class OptimizationTest(INCTestMixin): +class QuantizationTest(INCTestMixin): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( - ("text-classification", "hf-internal-testing/tiny-random-BertForSequenceClassification", 21), - # ("text-generation", "hf-internal-testing/tiny-random-BloomForCausalLM", 21), # TODO : enable causal lm task once INC ONNX export fixed + 
("text-classification", "bert", 21), + # ("text-generation", "bloom", 21), ) SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + ( - ("fill-mask", "hf-internal-testing/tiny-random-BertForMaskedLM", 22), - ("token-classification", "hf-internal-testing/tiny-random-AlbertForTokenClassification", 26), + ("fill-mask", "bert", 22), + ("token-classification", "albert", 26), ) TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( - "hf-internal-testing/tiny-random-BloomForCausalLM", - "hf-internal-testing/tiny-random-GPTNeoForCausalLM", - ) - - WEIGHT_ONLY_CONFIG = ( - ("RTN", "int4_clip"), - ("GPTQ", "int4_clip"), - ("RTN", "int8"), - ("", ""), + "bloom", + "gpt_neo", ) @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC) - def test_dynamic_quantization(self, task, model_name, expected_quantized_matmuls): + def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] quantization_config = PostTrainingQuantConfig(approach="dynamic") model_class = ORT_SUPPORTED_TASKS[task]["class"][0] tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -130,8 +122,9 @@ def test_dynamic_quantization(self, task, model_name, expected_quantized_matmuls ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_static_quantization(self, task, model_name, expected_quantized_matmuls): + def test_static_quantization(self, task, model_arch, expected_quantized_matmuls): num_samples = 10 + model_name = MODEL_NAMES[model_arch] model_class = ORT_SUPPORTED_TASKS[task]["class"][0] tokenizer = AutoTokenizer.from_pretrained(model_name) if tokenizer.pad_token is None: @@ -175,82 +168,6 @@ def test_static_quantization(self, task, model_name, expected_quantized_matmuls) num_samples=num_samples, ) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - @unittest.skipIf(is_torch_version(">=", "2.2.0"), "compatibility issue with torch 2.2.0 and IPEX latest version") - def test_ipex_static_quantization_with_smoothquant(self, task, model_name, expected_quantized_matmuls): - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}} - num_samples = 10 - quantization_config = PostTrainingQuantConfig(approach="static", backend="ipex", recipes=recipes) - model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - quantizer = INCQuantizer.from_pretrained(model, task=task) - calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples) - - with tempfile.TemporaryDirectory() as tmp_dir: - quantizer.quantize( - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, - save_directory=tmp_dir, - save_onnx_model=False, - ) - self.check_model_outputs( - q_model=quantizer._quantized_model, - task=task, - tokenizer=tokenizer, - save_directory=tmp_dir, - expected_quantized_matmuls=expected_quantized_matmuls, - is_static=True, - load_onnx_model=False, - num_samples=num_samples, - load_inc_model=False, - load_ipex_model=True, - ) - - @parameterized.expand(WEIGHT_ONLY_CONFIG) - @unittest.skipIf( - not is_intel_extension_for_transformers_available(), reason="Intel-extension-for-transformers not available!" 
- ) - def test_weight_only_quantization(self, methodology, weight_dtype): - model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM" - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") - calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) - - with tempfile.TemporaryDirectory() as tmp_dir: - if methodology: - gptq_args = { - "percdamp": 0.01, - "act_order": False, - "scheme": "sym", - } - - quantization_config = WeightOnlyQuantConfig( - algorithm=methodology, - algorithm_args=gptq_args if methodology == "GPTQ" else None, - weight_dtype=weight_dtype, - ) - quantizer.quantize( - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, - save_directory=tmp_dir, - ) - else: - quantizer.quantize( - quantization_config=None, - save_directory=tmp_dir, - weight_only=True, # use RTN quantization method and NF4 weight data type is default. - ) - - q_model = INCModelForCausalLM.from_pretrained(tmp_dir) - inp = torch.tensor([calibration_dataset[0]["input_ids"]]) - out = model(inp)[0] - q_out = q_model(inp)[0] - self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1))) - def test_dynamic_accuracy_strategy_quantization(self): model_name = "distilbert-base-cased-distilled-squad" model = AutoModelForQuestionAnswering.from_pretrained(model_name) @@ -330,7 +247,8 @@ def test_dynamic_diffusion_model(self): self.assertTrue(np.allclose(loaded_pipe_outputs, outputs, atol=1e-4)) @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) - def test_quantize_text_generate_model(self, model_id): + def test_quantize_text_generate_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] set_seed(42) model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -357,8 +275,13 @@ def calibration_fn(p_model): outputs = model.generate(**tokens, do_sample=False, num_beams=1, temperature=0.9, min_length=20, max_length=20) self.assertTrue(torch.equal(pre_outputs, outputs)) + +class TrainingOptimizationTest(INCTestMixin): + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("text-classification", "bert", 21),) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_aware_training_quantization(self, task, model_name, expected_quantized_matmuls): + def test_aware_training_quantization(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] quantization_config = QuantizationAwareTrainingConfig() save_onnx_model = False @@ -381,7 +304,8 @@ def test_aware_training_quantization(self, task, model_name, expected_quantized_ ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_aware_training_quantization_pruning(self, task, model_name, expected_quantized_matmuls): + def test_aware_training_quantization_pruning(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] quantization_config = QuantizationAwareTrainingConfig() target_sparsity = 0.9 pruning_config = WeightPruningConfig( @@ -413,7 +337,8 @@ def test_aware_training_quantization_pruning(self, task, model_name, expected_qu ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_magnitude_pruning(self, task, model_name, expected_quantized_matmuls): + def 
test_magnitude_pruning(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] target_sparsity = 0.9 # end_step should be training_args.num_train_epochs * (len(train_dataset) // training_args.per_device_train_batch_size) pruning_config = WeightPruningConfig( @@ -452,7 +377,8 @@ def test_magnitude_pruning(self, task, model_name, expected_quantized_matmuls): self.assertEqual(inc_config.pruning["pattern"], "4x1") @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_distillation(self, task, model_name, expected_quantized_matmuls): + def test_distillation(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] teacher_model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name) distillation_config = DistillationConfig(teacher_model=teacher_model) save_onnx_model = True @@ -575,3 +501,54 @@ def _compute_metrics(pred): self.assertIsInstance(loaded_model_outputs.logits, torch.Tensor) # Compare tensor outputs # self.assertTrue(torch.allclose(loaded_model_outputs.logits, model_outputs.logits, atol=1e-4)) + + +class WeightOnlyQuantizationTest(INCTestMixin): + WEIGHT_ONLY_CONFIG = ( + ("rtn", "int4_clip"), + ("rtn", "int8"), + ("gptq", "int4_clip"), + ) + + @parameterized.expand(WEIGHT_ONLY_CONFIG) + @unittest.skipIf(not is_intel_extension_for_transformers_available(), reason="ITREX not available") + def test_weight_only_quantization(self, methodology, weight_dtype): + model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM" + + from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig + + bits = 4 if "4" in weight_dtype else 8 + if methodology == "gptq": + # max_input_length can be removed after neural-compressor > v2.5.1 + quantization_config = GPTQConfig( + bits=bits, sym=True, damp_percent=0.01, weight_dtype=weight_dtype, max_input_length=128 + ) + else: + quantization_config = RtnConfig(bits=bits, weight_dtype=weight_dtype) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") + calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) + + with tempfile.TemporaryDirectory() as tmp_dir: + quantizer.quantize( + quantization_config=quantization_config, + calibration_dataset=calibration_dataset, + save_directory=tmp_dir, + ) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir) + + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with torch.no_grad(): + loaded_outputs = loaded_model(**tokens) + # quantizer_outputs = model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + + # self.assertTrue(torch.allclose(quantizer_outputs.logits, loaded_outputs.logits, equal_nan=True, atol=1e-4)) diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py index 214aa73be5..c91270355a 100644 --- a/tests/neural_compressor/utils_tests.py +++ b/tests/neural_compressor/utils_tests.py @@ -41,15 +41,15 @@ ) -from optimum.intel.utils.import_utils import is_torch_version +from optimum.intel.utils.import_utils import is_ipex_available from optimum.intel.neural_compressor.utils 
import _HEAD_TO_AUTOMODELS from optimum.intel.utils.constant import ONNX_WEIGHTS_NAME from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification from optimum.pipelines import ORT_SUPPORTED_TASKS -if is_torch_version("<", "2.2.0"): - from optimum.intel.ipex import ( +if is_ipex_available(): + from optimum.intel import ( IPEXModelForCausalLM, IPEXModelForSequenceClassification, IPEXModelForMaskedLM, @@ -65,6 +65,50 @@ } +MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", + "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", + "distilbert": "hf-internal-testing/tiny-random-distilbert", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": "hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", +} + + def num_quantized_matmul_onnx_model(onnx_model): num_quantized_matmul = 0 for node in onnx_model.graph.node: @@ -145,8 +189,7 @@ def check_model_outputs( ort_model = ORT_SUPPORTED_TASKS[task]["class"][0].from_pretrained(save_directory, **model_kwargs) ort_outputs = ort_model(**tokens) self.assertTrue("logits" in ort_outputs) - if task != "fill-mask": - self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-2)) + # self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-2)) @staticmethod def get_trainer( From 4651ac2c4a05e047c86c66a3347b39cb6384cd0b Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Thu, 18 Apr 2024 11:11:08 +0100 Subject: [PATCH 20/25] Export hybrid StableDiffusion models via optimum-cli (#618) * Export hybrid StableDiffusion models via optimum-cli * Add doc and 
test

* Remove huggingface_hub

* remove quantization from main_export

* remove unused function

* Infer task by loading the diffusers config

* Fix style

* fix tests

---------

Co-authored-by: Ella Charlaix
---
 README.md                                    |  8 +-
 optimum/commands/export/openvino.py          | 80 ++++++++++++++++----
 optimum/exporters/openvino/__main__.py       |  2 +-
 optimum/intel/openvino/modeling_diffusion.py |  2 +-
 optimum/intel/openvino/utils.py              |  1 +
 tests/openvino/test_exporters_cli.py         | 20 +++++
 6 files changed, 95 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 78ca130145..41537d8971 100644
--- a/README.md
+++ b/README.md
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```

-You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model: the model linear, embedding and convolution weights will be quantized to INT8, while the activations are kept in floating point precision.

 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```

+Quantization in hybrid mode can be applied to a Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization to the rest of the pipeline components. In hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).

 #### Inference:
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 6c17a333ef..40901fbf90 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Optional

 from ...exporters import TasksManager
+from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
 from ..base import BaseOptimumCLICommand, CommandInfo


@@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
     )
+    optional_group.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help=(
+            "The dataset used for data-aware compression or quantization with NNCF. "
+            "You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLMs "
+            "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+ ), + ) optional_group.add_argument( "--disable-stateful", action="store_true", @@ -195,20 +206,59 @@ def run(self): ) quantization_config["sym"] = "asym" not in self.args.weight_format quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64 + quantization_config["dataset"] = self.args.dataset ov_config = OVConfig(quantization_config=quantization_config) - # TODO : add input shapes - main_export( - model_name_or_path=self.args.model, - output=self.args.output, - task=self.args.task, - framework=self.args.framework, - cache_dir=self.args.cache_dir, - trust_remote_code=self.args.trust_remote_code, - pad_token_id=self.args.pad_token_id, - ov_config=ov_config, - stateful=not self.args.disable_stateful, - convert_tokenizer=self.args.convert_tokenizer, - library_name=self.args.library - # **input_shapes, - ) + library_name = TasksManager.infer_library_from_model(self.args.model) + + if ( + library_name == "diffusers" + and ov_config + and ov_config.quantization_config + and ov_config.quantization_config.dataset is not None + ): + if not is_diffusers_available(): + raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models")) + + from diffusers import DiffusionPipeline + + diffusers_config = DiffusionPipeline.load_config(self.args.model) + class_name = diffusers_config.get("_class_name", None) + + if class_name == "LatentConsistencyModelPipeline": + from optimum.intel import OVLatentConsistencyModelPipeline + + model_cls = OVLatentConsistencyModelPipeline + + elif class_name == "StableDiffusionXLPipeline": + from optimum.intel import OVStableDiffusionXLPipeline + + model_cls = OVStableDiffusionXLPipeline + elif class_name == "StableDiffusionPipeline": + from optimum.intel import OVStableDiffusionPipeline + + model_cls = OVStableDiffusionPipeline + else: + raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") + + model = model_cls.from_pretrained( + self.args.model, export=True, quantization_config=ov_config.quantization_config + ) + model.save_pretrained(self.args.output) + + else: + # TODO : add input shapes + main_export( + model_name_or_path=self.args.model, + output=self.args.output, + task=self.args.task, + framework=self.args.framework, + cache_dir=self.args.cache_dir, + trust_remote_code=self.args.trust_remote_code, + pad_token_id=self.args.pad_token_id, + ov_config=ov_config, + stateful=not self.args.disable_stateful, + convert_tokenizer=self.args.convert_tokenizer, + library_name=library_name, + # **input_shapes, + ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 8b8cc09fc1..5f74c1de8b 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -77,7 +77,7 @@ def main_export( model_name_or_path (`str`): Model ID on huggingface.co or path on disk to the model repository to export. output (`Union[str, Path]`): - Path indicating the directory where to store the generated ONNX model. + Path indicating the directory where to store the generated OpenVINO model. 
> Optional parameters diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1e562749b2..7bc7cca04c 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -387,7 +387,7 @@ def transform_fn(data_item): self.__call__(**inputs, height=height, width=width) else: self.__call__(*inputs, height=height, width=width) - if len(calibration_data) > num_samples: + if len(calibration_data) >= num_samples: break self.unet.request = self.unet.request.request diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index a0439d2129..4d1479f733 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -96,6 +96,7 @@ "stable-diffusion": "OVStableDiffusionPipeline", "stable-diffusion-xl": "OVStableDiffusionXLPipeline", "pix2struct": "OVModelForPix2Struct", + "latent-consistency": "OVLatentConsistencyModelPipeline", } diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 46c6e3c69a..7d618c530e 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -26,6 +26,7 @@ from optimum.exporters.openvino.__main__ import main_export from optimum.intel import ( # noqa + OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, OVModelForFeatureExtraction, @@ -77,6 +78,12 @@ class OVCLIExportTestCase(unittest.TestCase): "stable-diffusion-xl": 0, # not supported } + SUPPORTED_SD_HYBRID_ARCHITECTURES = ( + ("stable-diffusion", 72, 195), + ("stable-diffusion-xl", 84, 331), + ("latent-consistency", 50, 135), + ) + SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),) SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"] @@ -176,6 +183,19 @@ def test_exporters_cli_int8(self, task: str, model_type: str): _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_int8[i], num_int8) + @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES) + def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int): + with TemporaryDirectory() as tmpdir: + subprocess.run( + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --dataset laion/filtered-wit --weight-format int8 {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS[model_type]).from_pretrained(tmpdir) + num_fq, num_int8, _ = get_num_quantized_nodes(model.unet) + self.assertEqual(exp_num_int8, num_int8) + self.assertEqual(exp_num_fq, num_fq) + @parameterized.expand(TEST_4BIT_CONFIGURATONS) def test_exporters_cli_int4(self, task: str, model_type: str, option: str): with TemporaryDirectory() as tmpdir: From 0d943f842cea30c94877bc954958f18327ecd288 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 18 Apr 2024 16:52:00 +0400 Subject: [PATCH 21/25] Convert Tokenizers By Default (#580) * Convert Tokenizers By Default * Add Warning to Deprecated Option * Update OV Tokenizers Availability Check * Move openvino-tokenizers to openvino dependencies * Make Style * Change Imports to Absolute * Check openvino-nightly compatibility * Change model skip explanation * Update OV Tokenizers Availability Check * Add Check for OpenVINO Nightly and Archive * Add linux distros compatibility message * Address Review Comments * Address Review Comments * Address Review Comments * Fix Style * Change Warnings to Debug Level * Fix Tests Debug Message * Fix Style * Fix Style --- 
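In practice, the new default can be illustrated with the CLI below (a sketch based only on the flags touched by this patch; `gpt2` and the `ov_model` output directory are placeholders, and `--convert-tokenizer` is kept only as a deprecated option that emits a warning):

```plain
# Tokenizer and detokenizer OpenVINO models are now exported by default:
optimum-cli export openvino --model gpt2 ov_model

# Opt out of tokenizer conversion with the newly added flag:
optimum-cli export openvino --model gpt2 --disable-convert-tokenizer ov_model
```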
optimum/commands/export/openvino.py | 12 ++- optimum/exporters/openvino/__main__.py | 15 ++-- optimum/exporters/openvino/convert.py | 14 +--- optimum/intel/utils/import_utils.py | 103 ++++++++++++++++++------- setup.py | 9 +-- tests/openvino/test_exporters_cli.py | 24 +++--- 6 files changed, 107 insertions(+), 70 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 40901fbf90..3015d7b5b5 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -126,10 +126,15 @@ def parse_args_openvino(parser: "ArgumentParser"): "OpenVINO native inference code that expects kv-cache inputs and outputs in the model." ), ) + optional_group.add_argument( + "--disable-convert-tokenizer", + action="store_true", + help="Do not add converted tokenizer and detokenizer OpenVINO models.", + ) optional_group.add_argument( "--convert-tokenizer", action="store_true", - help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers", + help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.", ) optional_group.add_argument( @@ -247,6 +252,9 @@ def run(self): model.save_pretrained(self.args.output) else: + if self.args.convert_tokenizer: + logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") + # TODO : add input shapes main_export( model_name_or_path=self.args.model, @@ -258,7 +266,7 @@ def run(self): pad_token_id=self.args.pad_token_id, ov_config=ov_config, stateful=not self.args.disable_stateful, - convert_tokenizer=self.args.convert_tokenizer, + convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 5f74c1de8b..d7b29584d6 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -22,11 +22,10 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED +from optimum.exporters.openvino.convert import export_from_model, export_tokenizer +from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors -from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version -from .convert import export_from_model, export_tokenizer - if TYPE_CHECKING: from optimum.intel.openvino.configuration import OVConfig @@ -187,12 +186,6 @@ def main_export( f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" ) - if convert_tokenizer and not is_openvino_tokenizers_available(): - logger.warning( - "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`" - ) - convert_tokenizer = False - do_gptq_patching = False custom_architecture = False loading_kwargs = {} @@ -348,7 +341,7 @@ class StoreAttr(object): **kwargs_shapes, ) - if convert_tokenizer: + if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": tokenizer = next( (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)), @@ -371,6 +364,8 @@ class StoreAttr(object): tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: export_tokenizer(tokenizer_2, output, suffix="_2") + elif convert_tokenizer and not is_openvino_tokenizers_available(): + logger.warning("Tokenizer won't be converted.") # Unpatch modules after GPTQ export if do_gptq_patching: diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 5dd7c7bd90..55e3318017 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -20,7 +20,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union -from transformers import T5Tokenizer, T5TokenizerFast from transformers.utils import is_tf_available, is_torch_available from openvino.runtime import PartialShape, save_model @@ -49,9 +48,6 @@ ) -UNSUPPORTED_TOKENIZER_CLASSES = (T5Tokenizer, T5TokenizerFast) - - logger = logging.getLogger(__name__) if is_torch_available(): @@ -662,10 +658,6 @@ def export_tokenizer( ): from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports - if isinstance(tokenizer, UNSUPPORTED_TOKENIZER_CLASSES): - logger.info(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.") - return - try: from openvino_tokenizers import convert_tokenizer except ModuleNotFoundError: @@ -681,13 +673,13 @@ def export_tokenizer( try: converted = convert_tokenizer(tokenizer, with_detokenizer=True) except NotImplementedError: - logger.warning("Detokenizer is not supported, convert tokenizer only.") + logger.info("Detokenizer is not supported, convert tokenizer only.") converted = convert_tokenizer(tokenizer, with_detokenizer=False) except OVTypeError: - logger.warning(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.") + logger.debug(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.") return except Exception as exception: - logger.warning( + logger.debug( f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported. Exception: {exception}" ) return diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 08a9ec1f88..fcdf932a28 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import functools import importlib.util import logging import operator as op @@ -95,32 +95,6 @@ except ImportError: _openvino_available = False -_openvino_tokenizers_available = importlib.util.find_spec("openvino_tokenizers") is not None and _openvino_available -_openvino_tokenizers_version = "N/A" -if _openvino_tokenizers_available: - try: - _openvino_tokenizers_version = importlib_metadata.version("openvino_tokenizers") - except importlib_metadata.PackageNotFoundError: - _openvino_tokenizers_available = False - -if _openvino_tokenizers_available and _openvino_tokenizers_version != "N/A": - _compatible_openvino_version = next( - ( - requirement.split("==")[-1] - for requirement in importlib_metadata.requires("openvino-tokenizers") - if requirement.startswith("openvino==") - ), - "", - ) - _openvino_tokenizers_available = _compatible_openvino_version == ov_major_version - if not _openvino_tokenizers_available: - logger.warning( - "OpenVINO Tokenizer version is not compatible with OpenVINO version. " - f"Installed OpenVINO version: {ov_major_version}," - f"OpenVINO Tokenizers requires {_compatible_openvino_version}. " - f"OpenVINO Tokenizers models will not be added during export." - ) - _nncf_available = importlib.util.find_spec("nncf") is not None _nncf_version = "N/A" if _nncf_available: @@ -196,8 +170,81 @@ def is_openvino_available(): return _openvino_available +@functools.lru_cache(1) def is_openvino_tokenizers_available(): - return _openvino_tokenizers_available + if not is_openvino_available(): + return False + + if importlib.util.find_spec("openvino_tokenizers") is None: + logger.info( + "OpenVINO Tokenizers is not available. To deploy models in production " + "with C++ code, please follow installation instructions: " + "https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#installation\n" + ) + return False + + try: + pip_metadata_version = importlib_metadata.version("openvino") + except importlib_metadata.PackageNotFoundError: + pip_metadata_version = False + try: + pip_metadata_version = importlib_metadata.version("openvino-nightly") + is_nightly = True + except importlib_metadata.PackageNotFoundError: + is_nightly = False + + try: + import openvino_tokenizers + + openvino_tokenizers._get_factory() + except RuntimeError: + tokenizers_version = openvino_tokenizers.__version__ + + if tokenizers_version == "0.0.0.0": + try: + tokenizers_version = importlib_metadata.version("openvino_tokenizers") or tokenizers_version + except importlib_metadata.PackageNotFoundError: + pass + message = ( + "OpenVINO and OpenVINO Tokenizers versions are not binary compatible.\n" + f"OpenVINO version: {_openvino_version}\n" + f"OpenVINO Tokenizers version: {tokenizers_version}\n" + "First 3 numbers should be the same. Update OpenVINO Tokenizers to compatible version. " + ) + if not pip_metadata_version: + message += ( + "For archive installation of OpenVINO try to build OpenVINO Tokenizers from source: " + "https://github.com/openvinotoolkit/openvino_tokenizers/tree/master?tab=readme-ov-file" + "#build-and-install-from-source" + ) + if sys.platform == "linux": + message += ( + "\nThe PyPI version of OpenVINO Tokenizers is built on CentOS and may not be compatible with other " + "Linux distributions; rebuild OpenVINO Tokenizers from source." + ) + else: + message += ( + "It is recommended to use the same day builds for pre-release version. 
" + "To install both OpenVINO and OpenVINO Tokenizers release version perform:\n" + ) + if is_nightly: + message += "pip uninstall -y openvino-nightly && " + message += "pip install --force-reinstall openvino openvino-tokenizers\n" + if is_nightly: + message += ( + "openvino-nightly package will be deprecated in the future - use pre-release drops instead. " + ) + message += "To update both OpenVINO and OpenVINO Tokenizers to the latest pre-release version perform:\n" + if is_nightly: + message += "pip uninstall -y openvino-nightly && " + message += ( + "pip install --pre -U openvino openvino-tokenizers " + "--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly" + ) + logger.warning(message) + return False + + return True def is_nncf_available(): diff --git a/setup.py b/setup.py index 3978fd1fd6..0c794aaeb5 100644 --- a/setup.py +++ b/setup.py @@ -58,13 +58,8 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] EXTRAS_REQUIRE = { - "neural-compressor": [ - "neural-compressor>=2.2.0", - "onnxruntime<1.15.0", - "accelerate", - ], - "openvino": ["openvino>=2023.3", "nncf>=2.8.1"], - "openvino-tokenizers": ["openvino-tokenizers[transformers]"], + "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], + "openvino": ["openvino>=2023.3", "nncf>=2.8.1", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.8.1"], "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"], "diffusers": ["diffusers"], diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 7d618c530e..6e1c7a56bd 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -66,7 +66,7 @@ class OVCLIExportTestCase(unittest.TestCase): ) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { "gpt2": 2, - "t5": 0, # failed internal sentencepiece check - no token in the vocab + "t5": 0, # no .model file in the repository "albert": 0, # not supported yet "distilbert": 1, # no detokenizer "roberta": 2, @@ -125,26 +125,26 @@ def test_exporters_cli(self, task: str, model_type: str): for arch in SUPPORTED_ARCHITECTURES if not arch[0].endswith("-with-past") and not arch[1].endswith("-refiner") ) - @unittest.skipIf(not is_openvino_tokenizers_available(), reason="OpenVINO Tokenizers not available") def test_exporters_cli_tokenizers(self, task: str, model_type: str): with TemporaryDirectory() as tmpdir: output = subprocess.check_output( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --convert-tokenizer --task {task} {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}", shell=True, stderr=subprocess.STDOUT, ).decode() - save_dir = Path(tmpdir) - number_of_tokenizers = sum("tokenizer" in file for file in map(str, save_dir.rglob("*.xml"))) - self.assertEqual( - self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type], - number_of_tokenizers, - f"OVT: {is_openvino_tokenizers_available() }", - ) + if not is_openvino_tokenizers_available(): + self.assertTrue( + "OpenVINO Tokenizers is not available." in output + or "OpenVINO and OpenVINO Tokenizers versions are not binary compatible." in output, + msg=output, + ) + return + + number_of_tokenizers = sum("tokenizer" in file for file in map(str, Path(tmpdir).rglob("*.xml"))) + self.assertEqual(self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type], number_of_tokenizers, output) if number_of_tokenizers == 1: self.assertTrue("Detokenizer is not supported, convert tokenizer only." 
in output, output) - elif number_of_tokenizers == 0 and task not in ("image-classification", "audio-classification"): - self.assertTrue(("OpenVINO Tokenizer export for" in output and "is not supported." in output), output) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_exporters_cli_fp16(self, task: str, model_type: str): From ff1d94b5522a2bdda042f02564b76686bf3add6c Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:02:53 +0200 Subject: [PATCH 22/25] Remove nncf dependency from openvino configs (#668) * Remove nncf dependency from openvino configs * format * fix * fix format * Add quant_method attribute * format * set default value to quant_method attribute --- .github/workflows/test_openvino.yml | 6 +- optimum/intel/__init__.py | 21 +++--- optimum/intel/openvino/configuration.py | 67 ++++++------------- optimum/intel/openvino/quantization.py | 20 ++++-- .../utils/dummy_openvino_and_nncf_objects.py | 22 ++++++ optimum/intel/utils/dummy_openvino_objects.py | 11 --- tests/openvino/test_quantization.py | 34 +++------- 7 files changed, 82 insertions(+), 99 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index bff5cb525f..c7d20eb321 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -35,7 +35,11 @@ jobs: pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime - name: Test with Pytest run: | - pytest tests/openvino/ --ignore test_modeling_basic --durations=0 + pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0 + - name: Test basic + run: | + pip uninstall -y nncf + pytest tests/openvino/test_modeling_basic.py - name: Test openvino-nightly run: | pip uninstall -y openvino diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index c097562651..615e23801e 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -59,9 +59,13 @@ if not (is_openvino_available() and is_nncf_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - _import_structure["utils.dummy_openvino_and_nncf_objects"].extend(["OVQuantizer", "OVTrainingArguments"]) + _import_structure["utils.dummy_openvino_and_nncf_objects"].extend( + ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig"] + ) else: - _import_structure["openvino"].extend(["OVQuantizer", "OVTrainingArguments"]) + _import_structure["openvino"].extend( + ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig"] + ) try: @@ -124,8 +128,6 @@ "OVModelForVision2Seq", "OVModelForSequenceClassification", "OVModelForTokenClassification", - "OVQuantizationConfig", - "OVWeightQuantizationConfig", "OVConfig", ] ) @@ -188,9 +190,14 @@ if not (is_openvino_available() and is_nncf_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_openvino_and_nncf_objects import OVQuantizer, OVTrainingArguments + from .utils.dummy_openvino_and_nncf_objects import ( + OVQuantizationConfig, + OVQuantizer, + OVTrainingArguments, + OVWeightQuantizationConfig, + ) else: - from .openvino import OVQuantizer, OVTrainingArguments + from .openvino import OVQuantizationConfig, OVQuantizer, OVTrainingArguments, OVWeightQuantizationConfig try: if not (is_openvino_available() and is_nncf_available() and is_accelerate_available()): @@ -244,8 +251,6 @@ OVModelForSpeechSeq2Seq, 
OVModelForTokenClassification, OVModelForVision2Seq, - OVQuantizationConfig, - OVWeightQuantizationConfig, ) try: diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index e75301729d..1634222dd6 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -18,14 +18,17 @@ from enum import Enum from typing import Any, Dict, List, Optional, Union -import nncf import torch -from nncf.quantization.advanced_parameters import OverflowFix from transformers import PretrainedConfig from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod from optimum.configuration_utils import BaseConfig +from ..utils.import_utils import is_nncf_available + + +if is_nncf_available(): + import nncf logger = logging.getLogger(__name__) @@ -52,12 +55,18 @@ } +class OVQuantizationMethod(str, Enum): + DEFAULT = "default" + + @dataclass class OVQuantizationConfigBase(QuantizationConfigMixin): """ Base configuration class for quantization parameters """ + quant_method = OVQuantizationMethod.DEFAULT + def __init__( self, ignored_scope: Optional[dict] = None, @@ -91,7 +100,7 @@ def post_init(self): if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") - def get_ignored_scope_instance(self) -> nncf.IgnoredScope: + def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": if self.ignored_scope is None: return nncf.IgnoredScope() return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) @@ -178,10 +187,6 @@ def to_diff_dict(self) -> Dict[str, Any]: return self._to_dict_safe(to_diff_dict=True) -class OVQuantizationMethod(str, Enum): - DEFAULT = "default" - - @dataclass class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ @@ -240,7 +245,7 @@ def __init__( sensitivity_metric: Optional[str] = None, ignored_scope: Optional[dict] = None, num_samples: Optional[int] = None, - quant_method: Optional[Union[QuantizationMethod, OVQuantizationMethod]] = OVQuantizationMethod.DEFAULT, + quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, weight_only: Optional[bool] = True, **kwargs, ): @@ -309,12 +314,12 @@ def post_init(self): class OVQuantizationConfig(OVQuantizationConfigBase): def __init__( self, + sym: bool = False, ignored_scope: Optional[dict] = None, num_samples: Optional[int] = 300, - preset: nncf.QuantizationPreset = None, - model_type: nncf.ModelType = nncf.ModelType.TRANSFORMER, + model_type: str = "transformer", fast_bias_correction: bool = True, - overflow_fix: OverflowFix = OverflowFix.DISABLE, + overflow_fix: str = "disable", weight_only: Optional[bool] = False, **kwargs, ): @@ -323,23 +328,18 @@ def __init__( compression, during quantization both weights and activations are converted to lower precision. For weight-only model quantization please see OVWeightQuantizationConfig. Args: + sym (`bool`, defaults to `False`): + Whether to use symmetric quantization on the activations. Symmetric quantization will be applied on the weights in any case. ignored_scope (`dict`, *optional*): An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. 
- preset (`nncf.QuantizationPreset`, *optional*): - A preset controls the quantization mode (symmetric and asymmetric). - It can take the following values: - - `performance`: Symmetric quantization of weights and activations. - - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations. - Default value is None. In this case, `mixed` preset is used for `transformer` - model type otherwise `performance`. - model_type (`nncf.ModelType`, defaults to nncf.ModelType.TRANSFORMER): + model_type (`str`, defaults to "transformer"): Model type is needed to specify additional patterns in the model. Supported only `transformer` now. fast_bias_correction (`bool`, defaults to True): Whether to apply fast or full bias correction algorithm. - overflow_fix (`nncf.OverflowFix`, default to OverflowFix.DISABLE): + overflow_fix (`str`, default to "disable"): Parameter for controlling overflow fix setting. weight_only (`bool`, *optional*): Used to explicitly specify type of quantization (weight-only of full) to apply. Useful when building @@ -351,37 +351,12 @@ def __init__( "Please check your configuration." ) super().__init__(ignored_scope, num_samples, False) - # TODO: remove checks below once NNCF is updated to 2.10 - if isinstance(overflow_fix, str): - overflow_fix = OverflowFix(overflow_fix) - if isinstance(preset, str): - preset = nncf.QuantizationPreset(preset) - - self.preset = preset + self.sym = sym self.model_type = model_type self.fast_bias_correction = fast_bias_correction self.overflow_fix = overflow_fix self.post_init() - def to_dict(self) -> Dict[str, Any]: - # TODO: remove code below once NNCF is updated to 2.10 - if isinstance(self.overflow_fix, Enum) or isinstance(self.preset, Enum): - overflow_fix_value = ( - None - if self.overflow_fix is None - else self.overflow_fix - if isinstance(self.overflow_fix, str) - else self.overflow_fix.value - ) - preset_value = ( - None if self.preset is None else self.preset if isinstance(self.preset, str) else self.preset.value - ) - self_copy = copy.deepcopy(self) - self_copy.overflow_fix = overflow_fix_value - self_copy.preset = preset_value - return self_copy.to_dict() - return super().to_dict() - def _check_default_4bit_configs(config: PretrainedConfig): return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 33985dbe6e..a2bef7aab3 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -26,7 +26,7 @@ import torch import transformers from nncf import CompressWeightsMode, SensitivityMetric -from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters +from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix from nncf.torch import register_module from nncf.torch.initialization import PTInitializingDataLoader from openvino._offline_transformations import compress_quantize_weights_transformation @@ -378,10 +378,12 @@ def _quantize_ovbasemodel( quantization_dataset, subset_size=quantization_config.num_samples, ignored_scope=quantization_config.get_ignored_scope_instance(), - model_type=quantization_config.model_type, - preset=quantization_config.preset, + model_type=nncf.ModelType(quantization_config.model_type), + preset=nncf.QuantizationPreset.PERFORMANCE if quantization_config.sym else nncf.QuantizationPreset.MIXED, fast_bias_correction=quantization_config.fast_bias_correction, - 
advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), + advanced_parameters=nncf.AdvancedQuantizationParameters( + overflow_fix=OverflowFix(quantization_config.overflow_fix) + ), **kwargs, ) self.model.model = quantized_model @@ -476,10 +478,14 @@ def _quantize_torchmodel( quantization_dataset, subset_size=quantization_config.num_samples, ignored_scope=quantization_config.get_ignored_scope_instance(), - model_type=quantization_config.model_type, - preset=quantization_config.preset, + model_type=nncf.ModelType(quantization_config.model_type), + preset=nncf.QuantizationPreset.PERFORMANCE + if quantization_config.sym + else nncf.QuantizationPreset.MIXED, fast_bias_correction=quantization_config.fast_bias_correction, - advanced_parameters=nncf.AdvancedQuantizationParameters(overflow_fix=quantization_config.overflow_fix), + advanced_parameters=nncf.AdvancedQuantizationParameters( + overflow_fix=OverflowFix(quantization_config.overflow_fix) + ), **kwargs, ) diff --git a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py index 8ae3135667..e646074e1e 100644 --- a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py @@ -46,3 +46,25 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "nncf"]) + + +class OVWeightQuantizationConfig(metaclass=DummyObject): + _backends = ["openvino", "nncf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "nncf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "nncf"]) + + +class OVQuantizationConfig(metaclass=DummyObject): + _backends = ["openvino", "nncf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "nncf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "nncf"]) diff --git a/optimum/intel/utils/dummy_openvino_objects.py b/optimum/intel/utils/dummy_openvino_objects.py index d5e42851da..5af3222d86 100644 --- a/optimum/intel/utils/dummy_openvino_objects.py +++ b/optimum/intel/utils/dummy_openvino_objects.py @@ -189,14 +189,3 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino"]) - - -class OVWeightQuantizationConfig(metaclass=DummyObject): - _backends = ["openvino"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["openvino"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["openvino"]) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b22d5e3955..e269578c35 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -748,10 +748,10 @@ class OVQuantizationConfigTest(unittest.TestCase): OVQuantizationConfig( ignored_scope={"names": ["op_name"]}, num_samples=100, - preset=nncf.QuantizationPreset.MIXED, - model_type=nncf.ModelType.TRANSFORMER, + sym=False, + model_type="transformer", fast_bias_correction=True, - overflow_fix=OverflowFix.DISABLE, + overflow_fix="disable", ), ), (OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),), @@ -789,15 +789,15 @@ class OVQuantizationConfigTest(unittest.TestCase): OVWeightQuantizationConfig, "Can't determine type of OV quantization config", ), - 
(dict(model_type=nncf.ModelType.TRANSFORMER), OVQuantizationConfig, None), + (dict(model_type="transformer"), OVQuantizationConfig, None), ( dict( ignored_scope={"names": ["op_name"]}, num_samples=100, - preset=nncf.QuantizationPreset.MIXED, - model_type=nncf.ModelType.TRANSFORMER, + sym=False, + model_type="transformer", fast_bias_correction=True, - overflow_fix=OverflowFix.DISABLE, + overflow_fix="disable", ), OVQuantizationConfig, None, @@ -809,21 +809,11 @@ class OVQuantizationConfigTest(unittest.TestCase): (dict(bits=8, fast_bias_correction=True, weight_only=True), OVWeightQuantizationConfig, None), (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None), (dict(bits=8, sym=True, weight_only=False), OVWeightQuantizationConfig, "Please check your configuration"), - ( - dict(model_type=nncf.ModelType.TRANSFORMER, weight_only=True), - OVQuantizationConfig, - "Please check your configuration", - ), + (dict(model_type="transformer", weight_only=True), OVQuantizationConfig, "Please check your configuration"), ) @parameterized.expand(QUANTIZATION_CONFIGS) def test_config_serialization(self, quantization_config: OVQuantizationConfigBase): - def str_to_enum(enum_cls, value): - for k, v in enum_cls.__members__.items(): - if getattr(enum_cls, k).value == value: - return v - raise ValueError(f"Could not convert string {value} to enum value of type {enum_cls}") - ov_config = OVConfig(quantization_config=quantization_config) with tempfile.TemporaryDirectory() as tmp_dir: ov_config.save_pretrained(tmp_dir) @@ -834,14 +824,6 @@ def str_to_enum(enum_cls, value): return for key, value in loaded_ov_config.quantization_config.to_dict().items(): initial_value = getattr(ov_config.quantization_config, key) - if key == "preset" or key == "overflow_fix": - # TODO: remove once NNCF is updated to 2.10 - if getattr(quantization_config, key) is not None: - self.assertTrue(isinstance(value, str)) - if key == "preset": - value = str_to_enum(nncf.QuantizationPreset, value) - else: - value = str_to_enum(OverflowFix, value) self.assertEqual(value, initial_value) @parameterized.expand(QUANTIZATION_CONFIG_DICTS) From f392c9b1d51e722cbbe1c1498fd66b29d77b5249 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Fri, 19 Apr 2024 11:44:26 +0200 Subject: [PATCH 23/25] Custom tasks modeling (#669) * added custom tasks modeling * patched output names for now and added vit with a attentions test * test passing * fix attentions * added hidden states test * remove unnecessary names processing * better testing * added inputs check Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * added a bert with pooler * fix name * added a custom export test * better custom config --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/__init__.py | 4 +- optimum/intel/openvino/__init__.py | 1 + optimum/intel/openvino/modeling.py | 64 ++++++++++++++++++++++ tests/openvino/test_export.py | 41 +++++++++++++- tests/openvino/test_modeling.py | 85 ++++++++++++++++++++++++++++++ tests/openvino/utils_tests.py | 2 + 6 files changed, 195 insertions(+), 2 deletions(-) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 615e23801e..f9234cb3b1 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -116,8 +116,9 @@ "OVModelForAudioClassification", "OVModelForAudioFrameClassification", "OVModelForAudioXVector", - "OVModelForCTC", "OVModelForCausalLM", + 
"OVModelForCTC", + "OVModelForCustomTasks", "OVModelForFeatureExtraction", "OVModelForImageClassification", "OVModelForMaskedLM", @@ -242,6 +243,7 @@ OVModelForAudioXVector, OVModelForCausalLM, OVModelForCTC, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 0cd7d8a029..b871668588 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -49,6 +49,7 @@ OVModelForAudioFrameClassification, OVModelForAudioXVector, OVModelForCTC, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 8a816609fa..9c7c2b5258 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -43,6 +43,7 @@ CausalLMOutput, ImageClassifierOutput, MaskedLMOutput, + ModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, @@ -953,3 +954,66 @@ def forward( logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] return TokenClassifierOutput(logits=logits) + + +CUSTOM_TASKS_EXAMPLE = """ + Example of custom tasks (e.g. a sentence transformers with a pooler head): + + ```python + >>> from transformers import {processor_class} + >>> from optimum.intel import {model_class} + + >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}") + + >>> inputs = tokenizer("I love burritos!", return_tensors="np") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooler_output = outputs.pooler_output + ``` +""" + + +@add_start_docstrings( + """ + OpenVINO Model for custom tasks. It can be used to leverage the inference acceleration for any single-file OpenVINO model, that may use custom inputs and outputs. + """, + MODEL_START_DOCSTRING, +) +class OVModelForCustomTasks(OVModel): + @add_start_docstrings_to_model_forward( + CUSTOM_TASKS_EXAMPLE.format( + processor_class=_TOKENIZER_FOR_DOC, + model_class="OVModelForCustomTasks", + checkpoint="IlyasMoutawwakil/sbert-all-MiniLM-L6-v2-with-pooler", + ) + ) + def forward(self, **kwargs): + expected_inputs_names = set(self.input_names) + inputs_names = set(kwargs) + + if not expected_inputs_names.issubset(inputs_names): + raise ValueError( + f"Got unexpected inputs: expecting the following inputs : {', '.join(expected_inputs_names)} but got : {', '.join(inputs_names)}." + ) + + np_inputs = isinstance(next(iter(kwargs.values())), np.ndarray) + inputs = {} + for input_name in self.input_names: + inputs[input_name] = np.array(kwargs.pop(input_name)) if not np_inputs else kwargs.pop(input_name) + + outputs = self.request(inputs) + + model_outputs = {} + for key, value in outputs.items(): + key_name = next(iter(key.names)) + if "." 
in key_name: + key_name = key_name.split(".")[0] + if key_name not in model_outputs: + model_outputs[key_name] = [] + model_outputs[key_name].append(torch.from_numpy(value).to(self.device) if not np_inputs else value) + else: + model_outputs[key_name] = torch.from_numpy(value).to(self.device) if not np_inputs else value + + return ModelOutput(**model_outputs) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 21bec021f8..9d1daaab63 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -19,15 +19,18 @@ from typing import Optional from parameterized import parameterized +from transformers import AutoConfig from utils_tests import MODEL_NAMES from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED -from optimum.exporters.openvino import export_from_model +from optimum.exporters.onnx.model_configs import BertOnnxConfig +from optimum.exporters.openvino import export_from_model, main_export from optimum.exporters.tasks import TasksManager from optimum.intel import ( OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, @@ -114,3 +117,39 @@ def _openvino_export( @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_export(self, model_type: str): self._openvino_export(model_type) + + +class CustomExportModelTest(unittest.TestCase): + def test_export_custom_model(self): + class BertOnnxConfigWithPooler(BertOnnxConfig): + @property + def outputs(self): + if self.task == "feature-extraction-with-pooler": + common_outputs = {} + common_outputs["last_hidden_state"] = {0: "batch_size", 1: "sequence_length"} + common_outputs["pooler_output"] = {0: "batch_size"} + else: + common_outputs = super().outputs + + return common_outputs + + base_task = "feature-extraction" + custom_task = f"{base_task}-with-pooler" + model_id = "sentence-transformers/all-MiniLM-L6-v2" + + config = AutoConfig.from_pretrained(model_id) + custom_export_configs = {"model": BertOnnxConfigWithPooler(config, task=custom_task)} + + with TemporaryDirectory() as tmpdirname: + main_export( + model_name_or_path=model_id, + custom_export_configs=custom_export_configs, + library_name="transformers", + output=Path(tmpdirname), + task=base_task, + ) + + ov_model = OVModelForCustomTasks.from_pretrained(tmpdirname) + + self.assertIsInstance(ov_model, OVBaseModel) + self.assertTrue(ov_model.output_names == {"last_hidden_state": 0, "pooler_output": 1}) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 907c767310..f84cac8161 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -63,6 +63,7 @@ OVModelForAudioXVector, OVModelForCausalLM, OVModelForCTC, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, @@ -1525,3 +1526,87 @@ def test_pipeline_image_to_text(self, model_arch: str): self.assertIsInstance(outputs[0]["generated_text"], str) gc.collect() + + +class OVModelForCustomTasksIntegrationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES_WITH_ATTENTION = ["vit-with-attentions"] + SUPPORTED_ARCHITECTURES_WITH_HIDDEN_STATES = ["vit-with-hidden-states"] + + def _get_sample_image(self): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_ATTENTION) + def 
+    def test_compare_output_attentions(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+
+        image = self._get_sample_image()
+        preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
+        inputs = preprocessor(images=image, return_tensors="pt")
+
+        transformers_model = AutoModelForImageClassification.from_pretrained(model_id)
+        transformers_model.eval()
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**inputs, output_attentions=True)
+
+        ov_model = OVModelForCustomTasks.from_pretrained(model_id, ov_config=F32_CONFIG)
+        self.assertIsInstance(ov_model.config, PretrainedConfig)
+
+        for input_type in ["pt", "np"]:
+            inputs = preprocessor(images=image, return_tensors=input_type)
+            ov_outputs = ov_model(**inputs)
+            self.assertIn("logits", ov_outputs)
+            self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type])
+            self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4))
+            self.assertTrue(len(ov_outputs.attentions) == len(transformers_outputs.attentions))
+            for i in range(len(ov_outputs.attentions)):
+                self.assertTrue(
+                    torch.allclose(
+                        torch.Tensor(ov_outputs.attentions[i]),
+                        transformers_outputs.attentions[i],
+                        atol=1e-4,  # attentions are accurate
+                        rtol=1e-4,  # attentions are accurate
+                    ),
+                    f"Attention mismatch at layer {i}",
+                )
+
+        del transformers_model
+        del ov_model
+        gc.collect()
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HIDDEN_STATES)
+    def test_compare_output_hidden_states(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+
+        image = self._get_sample_image()
+        preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
+        inputs = preprocessor(images=image, return_tensors="pt")
+
+        transformers_model = AutoModelForImageClassification.from_pretrained(model_id)
+        transformers_model.eval()
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**inputs, output_hidden_states=True)
+
+        ov_model = OVModelForCustomTasks.from_pretrained(model_id, ov_config=F32_CONFIG)
+        self.assertIsInstance(ov_model.config, PretrainedConfig)
+        for input_type in ["pt", "np"]:
+            inputs = preprocessor(images=image, return_tensors=input_type)
+            ov_outputs = ov_model(**inputs)
+            self.assertIn("logits", ov_outputs)
+            self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type])
+            self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4))
+            self.assertTrue(len(ov_outputs.hidden_states) == len(transformers_outputs.hidden_states))
+            for i in range(len(ov_outputs.hidden_states)):
+                self.assertTrue(
+                    torch.allclose(
+                        torch.Tensor(ov_outputs.hidden_states[i]),
+                        transformers_outputs.hidden_states[i],
+                        atol=1e-3,  # hidden states are less accurate
+                        rtol=1e-2,  # hidden states are less accurate
+                    ),
+                    f"Hidden states mismatch at layer {i}",
+                )
+        del transformers_model
+        del ov_model
+        gc.collect()
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 73224c81b2..c610479dd7 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -100,6 +100,8 @@
     "unispeech": "hf-internal-testing/tiny-random-unispeech",
    "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel",
    "vit": "hf-internal-testing/tiny-random-vit",
+    "vit-with-attentions": "IlyasMoutawwakil/vit-with-attentions",
+    "vit-with-hidden-states": "IlyasMoutawwakil/vit-with-hidden_states",
    "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2",
"hf-internal-testing/tiny-random-WavlmModel", "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", From e6e5ffdb8482b858326f0bb23255ae4fef069d8a Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 19 Apr 2024 17:03:22 +0400 Subject: [PATCH 24/25] Fix sentence transformers CLI export (#674) * fix sentence transformers export after latest changes merge * add test case * Update optimum/commands/export/openvino.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/commands/export/openvino.py | 8 +++++++- setup.py | 1 + tests/openvino/test_exporters_cli.py | 28 ++++++++++++++++++++++++++++ tests/openvino/utils_tests.py | 1 + 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 3015d7b5b5..cdae847468 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -214,7 +214,13 @@ def run(self): quantization_config["dataset"] = self.args.dataset ov_config = OVConfig(quantization_config=quantization_config) - library_name = TasksManager.infer_library_from_model(self.args.model) + library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library) + if library_name == "sentence_transformers" and self.args.library is None: + logger.warning( + "Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`." + "`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers" + ) + library_name = "transformers" if ( library_name == "diffusers" diff --git a/setup.py b/setup.py index 0c794aaeb5..ea87e6ad59 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ "transformers_stream_generator", "einops", "tiktoken", + "sentence_transformers", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 6e1c7a56bd..09fad5d773 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -218,3 +218,31 @@ def test_exporters_cli_help(self): shell=True, check=True, ) + + def test_exporters_cli_sentence_transformers(self): + model_id = MODEL_NAMES["bge"] + with TemporaryDirectory() as tmpdir: + # default export creates transformers model + subprocess.run( + f"optimum-cli export openvino --model {model_id} --task feature-extraction {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS["feature-extraction"]).from_pretrained(tmpdir, compile=False) + self.assertTrue("last_hidden_state" in model.output_names) + # export with transformers lib creates transformers model + subprocess.run( + f"optimum-cli export openvino --model {model_id} --task feature-extraction --library transformers {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS["feature-extraction"]).from_pretrained(tmpdir, compile=False) + self.assertTrue("last_hidden_state" in model.output_names) + # export with sentence_transformers lib creates sentence_transformers model + subprocess.run( + f"optimum-cli export openvino --model {model_id} --task feature-extraction --library sentence_transformers {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS["feature-extraction"]).from_pretrained(tmpdir, compile=False) + 
self.assertFalse("last_hidden_state" in model.output_names) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c610479dd7..ca56f6d552 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -19,6 +19,7 @@ MODEL_NAMES = { "albert": "hf-internal-testing/tiny-random-albert", "audio_spectrogram_transformer": "Ericwang/tiny-random-ast", + "bge": "BAAI/bge-small-en-v1.5", "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", "bert": "hf-internal-testing/tiny-random-bert", "bart": "hf-internal-testing/tiny-random-bart", From 673b88bc9ec0df8ae8c35ea1cada32a9daf17a25 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 19 Apr 2024 15:04:44 +0200 Subject: [PATCH 25/25] Update OV quantization docs and QA notebook according to the recent changes (#671) * Fix quantization call in QA notebook * Update OV quantization docs * Apply PTQ if quantization config was not provided, but calibration dataset was provided * Add warning --- README.md | 5 +++-- docs/source/optimization_ov.mdx | 5 +++-- .../openvino/question_answering_quantization.ipynb | 13 +++++++------ optimum/intel/openvino/quantization.py | 7 ++++++- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 41537d8971..49f0d79768 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher ```python from functools import partial -from optimum.intel import OVQuantizer, OVModelForSequenceClassification +from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig from transformers import AutoTokenizer, AutoModelForSequenceClassification model_id = "distilbert-base-uncased-finetuned-sst-2-english" @@ -151,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset( # The directory where the quantized model will be saved save_dir = "nncf_results" # Apply static quantization and save the resulting model in the OpenVINO IR format -quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +ov_config = OVConfig(quantization_config=OVQuantizationConfig()) +quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir) # Load the quantized model optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir) ``` diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 1e78c36805..e018134964 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o ```python from transformers import AutoTokenizer -from optimum.intel import OVQuantizer, OVModelForSequenceClassification, +from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig model_id = "distilbert-base-uncased-finetuned-sst-2-english" model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) @@ -95,7 +95,8 @@ save_dir = "ptq_model" quantizer = OVQuantizer.from_pretrained(model) # Apply static quantization and export the resulting quantized model to OpenVINO IR format -quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +ov_config = OVConfig(quantization_config=OVQuantizationConfig()) +quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir) # Save the tokenizer 
 tokenizer.save_pretrained(save_dir)
 ```
diff --git a/notebooks/openvino/question_answering_quantization.ipynb b/notebooks/openvino/question_answering_quantization.ipynb
index ba4a84ca38..196e3ba6a7 100644
--- a/notebooks/openvino/question_answering_quantization.ipynb
+++ b/notebooks/openvino/question_answering_quantization.ipynb
@@ -51,7 +51,7 @@
     "import transformers\n",
     "from evaluate import evaluator\n",
     "from openvino.runtime import Core\n",
-    "from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer\n",
+    "from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n",
     "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
     "\n",
     "transformers.logging.set_verbosity_error()\n",
@@ -286,11 +286,11 @@
     "**NOTE:** if you notice very low accuracy after post-training quantization, it is likely caused by an overflow issue which affects processors that do not contain VNNI (Vector Neural Network Instruction). NNCF has an `overflow_fix` option to address this. It will effectively use 7-bits for quantizing instead of 8-bits to prevent the overflow. To use this option, modify the code in the next cell to add an explicit quantization configuration, and set `overflow_fix` to `\"enable\"`:\n",
     "\n",
     "```\n",
-    "from optimum.intel.openvino import OVConfig\n",
+    "from optimum.intel.openvino import OVConfig, OVQuantizationConfig\n",
     "\n",
-    "ov_config = OVConfig()\n",
-    "ov_config.compression[\"overflow_fix\"] = \"enable\"\n",
-    "quantizer = OVQuantizer.from_pretrained(model, ov_config=ov_config)\n",
+    "ov_config = OVConfig(quantization_config=OVQuantizationConfig(overflow_fix=\"enable\"))\n",
+    "quantizer = OVQuantizer.from_pretrained(model)\n",
+    "quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path, ov_config=ov_config)\n",
     "```\n",
     "\n",
     "For more information, see [Lower Numerical Precision Deep Learning Inference and Training](https://www.intel.com/content/www/us/en/developer/articles/technical/lower-numerical-precision-deep-learning-inference-and-training.html)"
@@ -317,7 +317,8 @@
     "\n",
     "# Quantize the model\n",
     "quantizer = OVQuantizer.from_pretrained(model)\n",
-    "quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path)"
+    "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
+    "quantizer.quantize(calibration_dataset=train_dataset, ov_config=ov_config, save_directory=int8_ptq_model_path)"
    ]
   },
   {
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index a2bef7aab3..aae66c148b 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -280,13 +280,18 @@ def quantize(
             raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.")
         quantization_config = ov_config.quantization_config
         if quantization_config is None:
-            if weights_only is None or weights_only is True:
+            if (weights_only is None or weights_only is True) and calibration_dataset is None:
                 if weights_only is None:
                     logger.info(
                         "`quantization_config` was not provided, 8-bit asymmetric weight quantization will be applied."
                     )
                 ov_config.quantization_config = OVWeightQuantizationConfig(bits=8)
             else:
+                logger.warning(
+                    "`quantization_config` was not provided, but calibration dataset was provided, assuming full "
+                    "model quantization is intended. In the future, please provide `quantization_config` as an "
+                    "instance of OVQuantizationConfig."
+                )
                 ov_config.quantization_config = OVQuantizationConfig()
 
         if isinstance(self.model, OVBaseModel):