diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 16c01e7298..1a8cb28bab 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,11 +32,17 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] - pip install intel-extension-for-pytorch==2.1.100 - pip install intel-extension-for-transformers==1.3.2 + pip install intel-extension-for-transformers pip install peft + - name: Test with Pytest run: | - pytest tests/neural_compressor/ + pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0 + - name: Test IPEX + run: | + pip uninstall -y intel-extension-for-transformers + pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu + pip install intel-extension-for-pytorch==2.1.100 + pytest tests/neural_compressor/test_ipex.py + diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index ba5b09ff81..c7d20eb321 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -35,7 +35,11 @@ jobs: pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime - name: Test with Pytest run: | - pytest tests/openvino/ --ignore test_modeling_basic + pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0 + - name: Test basic + run: | + pip uninstall -y nncf + pytest tests/openvino/test_modeling_basic.py - name: Test openvino-nightly run: | pip uninstall -y openvino diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index effb99a84d..3135e6c004 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -25,7 +25,7 @@ jobs: # Testing lower and upper bound of supported Python versions # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.11"] - transformers: ['transformers', 'git+https://github.com/huggingface/transformers.git'] + transformers: ['transformers'] optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git'] runs-on: ubuntu-20.04 @@ -42,7 +42,7 @@ jobs: # Install openvino manually to prevent dependency conflicts when .[openvino] pins # optimum or transformers to a specific version # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages - pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install .[tests] openvino onnx onnxruntime ${{ matrix.optimum}} ${{ matrix.transformers }} - name: Pip freeze @@ -51,4 +51,4 @@ jobs: - name: Test with Pytest run: | pytest tests/openvino/test_modeling_basic.py - + RUN_SLOW=1 pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0 \ No newline at end of file diff --git a/Makefile b/Makefile index 83035cf467..69e103466d 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ build_doc_docker_image: doc: build_doc_docker_image @test -n "$(BUILD_DIR)" || (echo "BUILD_DIR is empty." ; exit 1) @test -n "$(VERSION)" || (echo "VERSION is empty." 
; exit 1) - docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder doc_maker \ + docker run -v $(CURRENT_DIR):/doc_folder --workdir=/doc_folder --env CI=$(CI) doc_maker \ doc-builder build optimum.intel /optimum-intel/docs/source/ \ --repo_name optimum-intel \ --build_dir $(BUILD_DIR) \ diff --git a/README.md b/README.md index c29a923745..49f0d79768 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| -| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | -| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade-strategy eager "optimum[openvino]"` | -| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | +| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"` | +| [OpenVINO](https://docs.openvino.ai) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` | +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a optimum-cli export openvino --model gpt2 ov_model ``` -You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision. +You can also apply 8-bit weight-only quantization when exporting your model : the model linear, embedding and convolution weights will be quantized to INT8, the activations will be kept in floating point precision. ```plain optimum-cli export openvino --model gpt2 --weight-format int8 ov_model ``` +Quantization in hybrid mode can be applied to Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization for the rest of the pipeline components. In the hybrid mode, weights in MatMul and Embedding layers are quantized, as well as activations of other layers. + +```plain +optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model +``` + To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov). 
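The hybrid quantization applied by the CLI command above can also be reproduced from Python. The snippet below is a minimal sketch based on the export path added in this PR (the `OVWeightQuantizationConfig` usage mirrors the included notebook); the `num_samples` value and output directory are illustrative assumptions:

```python
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

# Passing a dataset together with 8-bit weights triggers hybrid quantization for
# diffusion pipelines: hybrid post-training quantization for the UNet and
# weight-only quantization for the remaining pipeline components.
quantization_config = OVWeightQuantizationConfig(
    bits=8, dataset="conceptual_captions", num_samples=200
)
pipe = OVStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", export=True, quantization_config=quantization_config
)
pipe.save_pretrained("ov_model")
```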
#### Inference: @@ -122,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher ```python from functools import partial -from optimum.intel import OVQuantizer, OVModelForSequenceClassification +from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig from transformers import AutoTokenizer, AutoModelForSequenceClassification model_id = "distilbert-base-uncased-finetuned-sst-2-english" @@ -145,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset( # The directory where the quantized model will be saved save_dir = "nncf_results" # Apply static quantization and save the resulting model in the OpenVINO IR format -quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +ov_config = OVConfig(quantization_config=OVQuantizationConfig()) +quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir) # Load the quantized model optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir) ``` diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index c29f5ceb95..aaab1b1f83 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -18,10 +18,10 @@ limitations under the License. To install the latest release of 🤗 Optimum Intel with the corresponding required dependencies, you can do respectively: -| Accelerator | Installation | -|:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------| -| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"`| -| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade-strategy eager "optimum[openvino]"` | +| Accelerator | Installation | +|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------| +| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"`| +| [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. 
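Once the statically quantized model from the README example above has been saved, it can be loaded back and used like any other `optimum-intel` model. A minimal sketch, assuming the `nncf_results` directory produced by `quantizer.quantize(...)`; the input text is a placeholder:

```python
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
save_dir = "nncf_results"  # directory written by quantizer.quantize(...) above

# OVModel* classes are drop-in replacements for their transformers counterparts,
# so the quantized model can be plugged directly into a transformers pipeline.
model = OVModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(model_id)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("This movie was absolutely wonderful!"))
```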
diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 1e78c36805..e018134964 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o ```python from transformers import AutoTokenizer -from optimum.intel import OVQuantizer, OVModelForSequenceClassification, +from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig model_id = "distilbert-base-uncased-finetuned-sst-2-english" model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) @@ -95,7 +95,8 @@ save_dir = "ptq_model" quantizer = OVQuantizer.from_pretrained(model) # Apply static quantization and export the resulting quantized model to OpenVINO IR format -quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir) +ov_config = OVConfig(quantization_config=OVQuantizationConfig()) +quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir) # Save the tokenizer tokenizer.save_pretrained(save_dir) ``` diff --git a/examples/neural_compressor/language-modeling/run_clm.py b/examples/neural_compressor/language-modeling/run_clm.py index ef24616307..1799ad6782 100644 --- a/examples/neural_compressor/language-modeling/run_clm.py +++ b/examples/neural_compressor/language-modeling/run_clm.py @@ -64,8 +64,7 @@ if is_intel_extension_for_transformers_available(): - from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig - + from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -227,8 +226,9 @@ class OptimizationArguments: metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."}, ) quantization_methodology: str = field( - default="RTN", - metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."}, + choices=["rtn", "gptq"], + default="rtn", + metadata={"help": "Quantization methodology for weight only quantization. 
Choose from 'rtn' and 'gptq'."}, ) damp_percent: float = field( default=0.01, @@ -662,22 +662,23 @@ def compute_metrics(eval_preds): raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization")) if optim_args.apply_pruning or optim_args.apply_distillation: raise ValueError("Weight only quantization and pruning or distillation cannot be combined.") - if optim_args.quantization_methodology == "GPTQ": - algorithm_args = { - "act_order": False, - "percdamp": optim_args.damp_percent, - "block_size": optim_args.gptq_block_size, - "nsamples": optim_args.num_calibration_samples, - "use_max_length": optim_args.use_max_length, - "pad_max_length": optim_args.pad_max_length, - } - quantization_config = WeightOnlyQuantConfig( - weight_dtype=optim_args.weight_dtype, - group_size=optim_args.group_size, - scheme=optim_args.weight_only_scheme, - algorithm=optim_args.quantization_methodology, - algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None, - ) + + algorithm_args = { + "weight_dtype": optim_args.weight_dtype, + "sym": optim_args.weight_only_scheme == "sym", + "group_size": optim_args.group_size, + } + + if optim_args.quantization_methodology == "gptq": + quantization_config = GPTQConfig( + damp_percent=optim_args.damp_percent, + nsamples=optim_args.num_calibration_samples, + blocksize=optim_args.gptq_block_size, + **algorithm_args, + ) + else: + quantization_config = RtnConfig(**algorithm_args) + else: quantization_config = PostTrainingQuantConfig( approach=optim_args.quantization_approach, recipes=recipes diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 582b463346..7671064088 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -223,14 +223,346 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "5179c53d-0436-4ee9-9367-2625a8d3e262", + "metadata": {}, + "source": [ + "## Assisted generation\n", + "Auto-regressive language models generate outputs token by token. 
Assisted generation (AG) is a general name for a group of methods that speculate the next generated tokens and then use the language model to validate the speculated tokens and accept/reject them.\n", + "AG is a great method to accelerate LMs running locally on your computer as it reduces memory bandwidth requirements and can speedup generation by 1.5x-3x without any accuracy degradation.\n", + "You can read more on assisted generation here in this great [blog post](https://huggingface.co/blog/assisted-generation).\n", + "\n", + "\n", + "In this section we will present how to run Phi-2 with two AG methods that are well supported within 🤗 transformers: Prompt Lookahead Decoding (PLD) and Speculative Decoding.\n", + "\n", + "To use Phi-2 with AG we will need to export the model again with `stateful=False` as OpenVINO stateful models don't support speculative decoding yet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc4484f-8234-4206-9f28-7a02a7444e25", + "metadata": {}, + "outputs": [], + "source": [ + "# Save the model in a different directory to set it apart from the stateful model\n", + "save_name = model_name.split(\"/\")[-1] + \"_openvino_stateless\"\n", + "\n", + "load_kwargs[\"ov_config\"][\"CACHE_DIR\"] = os.path.join(save_name, \"model_cache\")\n", + "\n", + "# Check whether the model was already exported\n", + "saved = os.path.exists(save_name)\n", + "\n", + "# We can use the same loading attributes, the only differece is the stateful attribute\n", + "stateless_model = OVModelForCausalLM.from_pretrained(\n", + " model_name if not saved else save_name,\n", + " export=not saved,\n", + " stateful=False,\n", + " **load_kwargs,\n", + ")\n", + "\n", + "# Save the exported model locally\n", + "if not saved:\n", + " stateless_model.save_pretrained(save_name)\n", + " tokenizer.save_pretrained(save_name)\n", + "\n", + "stateless_model.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "98d34b03-55e0-4606-be26-5722d6868679", + "metadata": {}, + "source": [ + "### Prompt lookahead decoding\n", + "Now we will run the same example from before with PLD enabled. \n", + "PLD speculates tokens by searching the last n-gram (usually 2-gram) in the sequence inside the prompt, if we find a match, we will take the next few tokens (configured with `prompt_lookup_num_tokens`) as our speculation, if a match is not found the code will revert back to auto-regressive generation.\n", + "\n", + "We will run the same example from before with PLD. 
To enable PLD, we simply pass the `prompt_lookup_num_tokens` key-word argument to the `generate` function.\n", + "Note that PLD can be great when doing code completion as some sequences of tokens tend to repeat themselves in the same order, names of variables, like `for i in range(...):`, etc.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a2c7f3-3417-4dec-981d-e99387cc18a8", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TextStreamer\n", + "\n", + "\n", + "# Tokenize the sample\n", + "inputs = tokenizer([sample], return_tensors='pt') \n", + "\n", + "out = stateless_model.generate(\n", + " **inputs,\n", + " max_new_tokens=128,\n", + " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " prompt_lookup_num_tokens=3,\n", + ") " + ] + }, + { + "cell_type": "markdown", + "id": "f0e4e211-e721-48bf-a73f-c987fd3321d3", + "metadata": {}, + "source": [ + "### Speculative decoding\n", + "Speculative Decoding was introduced in the paper [Fast Inference from Transformers via Speculative Decoding](https://arxiv.org/abs/2211.17192).\n", + "In this method the next tokens in the sequence are speculated using another smaller and much faster model which is called a draft model.\n", + "The only constraint we have on the draft model is that it has to have the same vocabulary as the target model, in our case Phi-2.\n", + "Phi-2 and CodeGen models share the same vocabulary and therefore we can use a much smaller and faster CodeGen model as a draft model to Phi-2.\n", + "A common metric for assessing if a draft model is performing well is the acceptance rate.\n", + "The acceptance rate measures how many tokens out of the speculated tokens in each window are accepted by the target model.\n", + "A higher acceptance rate will ensure a higher speedup and therefore it is a very important metric to measure when choosing a draft model.\n", + "\n", + "In this example we will use [CodeGen-350M-Multi](https://huggingface.co/Salesforce/codegen-350M-multi) as a draft model, it has 350M parameters which is ~10x smaller than Phi-2.\n", + "Next, we will prepare our chosen draft model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c996ba6-ef66-42a2-9bb4-2320372e4167", + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"Salesforce/codegen-350M-multi\"\n", + "save_name = model_name.split(\"/\")[-1] + \"_openvino_stateless\"\n", + "precision = \"f32\"\n", + "quantization_config = OVWeightQuantizationConfig(\n", + " bits=4,\n", + " sym=False,\n", + " group_size=128,\n", + " ratio=0.8,\n", + ")\n", + "device = \"cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb366707-4b99-4c79-a235-d3c887136965", + "metadata": {}, + "outputs": [], + "source": [ + "# Load kwargs\n", + "load_kwargs = {\n", + " \"device\": device,\n", + " \"ov_config\": {\n", + " \"PERFORMANCE_HINT\": \"LATENCY\",\n", + " \"INFERENCE_PRECISION_HINT\": precision,\n", + " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n", + " },\n", + " \"compile\": False,\n", + " \"quantization_config\": quantization_config\n", + "}\n", + "\n", + "# Check whether the model was already exported\n", + "saved = os.path.exists(save_name)\n", + "\n", + "asst_model = OVModelForCausalLM.from_pretrained(\n", + " model_name if not saved else save_name,\n", + " export=not saved,\n", + " stateful=False,\n", + " **load_kwargs,\n", + ")\n", + "\n", + "# Save the exported model locally\n", + "if not saved:\n", + " asst_model.save_pretrained(save_name)\n", + " tokenizer.save_pretrained(save_name)\n", + "\n", + "asst_model.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "4a95efed-22ce-43a0-af2a-e27500cfa514", + "metadata": {}, + "source": [ + "We will set the configuration of the draft model to predict 3 tokens at each forward step, we found that this setting works quite well in the current setup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1466938c-0945-4eb6-a80f-dd165cc5eca1", + "metadata": {}, + "outputs": [], + "source": [ + "asst_model.generation_config.num_assistant_tokens = 3\n", + "asst_model.generation_config.num_assistant_tokens_schedule = \"const\"" + ] + }, + { + "cell_type": "markdown", + "id": "74f6b4c4-4d8a-47fd-8172-6502cc5eef29", + "metadata": {}, + "source": [ + "Next, we will run the same example from before with speculative decoding." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a7e1516-6521-4346-bf85-5890341336f0", + "metadata": {}, + "outputs": [], + "source": [ + "out = stateless_model.generate(\n", + " **inputs,\n", + " max_new_tokens=128,\n", + " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " assistant_model=asst_model,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "dab6669b-f3f1-411e-b4b8-31ead823247f", + "metadata": {}, + "source": [ + "Note that in both cases of AG we presented, the generation result is exactly the same as Phi-2 would have generated it without AG!\n", + "\n", + "Like we mentioned before, the acceptance rate (AR) is a very important metric for choosing a draft.\n", + "We would like to make sure that CodeGen has a good AR with Phi-2.\n", + "For that purpose we implemented an easy utility class that uses the inputs' lengths and window sizes to calculate how many tokens were accepted by the target model at each step and calculate the AR using that information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "480d3e69-0899-4fa8-a85a-cd5a2ce23434", + "metadata": {}, + "outputs": [], + "source": [ + "from functools import wraps\n", + "import numpy as np\n", + "\n", + "\n", + "class AcceptanceRateRecorder:\n", + " def __init__(self, model):\n", + " self.model = model\n", + " self.model_forward = None\n", + " self.model_generate = None\n", + " self.seq_lens = []\n", + " self.win_sizes = []\n", + "\n", + " def __enter__(self):\n", + " # wrap forward method\n", + " if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n", + " raise RuntimeError(\"Always use a new instance, don't reuse!\")\n", + " self.model_forward = self.model.forward\n", + " \n", + " @wraps(self.model_forward)\n", + " def forward_wrapper(**kwargs):\n", + " self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n", + " self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n", + " return self.model_forward(**kwargs)\n", + " \n", + " self.model.forward = forward_wrapper\n", + " \n", + " # wrap generate method\n", + " self.model_generate = self.model.generate\n", + "\n", + " @wraps(self.model_generate)\n", + " def generate_wrapper(*args, **kwargs):\n", + " self.seq_lens.append([])\n", + " self.win_sizes.append([])\n", + " input_ids = args[0] if len(args) > 0 else kwargs.get(\"input_ids\")\n", + " self.seq_lens[-1].append(input_ids.shape[-1])\n", + " out = self.model_generate(*args, **kwargs)\n", + " self.seq_lens[-1].append(out.shape[-1])\n", + " return out\n", + " self.model.generate = generate_wrapper\n", + " return self\n", + "\n", + " def __exit__(self, type, value, traceback):\n", + " self.model.forward = self.model_forward\n", + " self.model.generate = self.model_generate\n", + " self.model_forward = None\n", + " self.model_generate = None\n", + " # Fix first window size\n", + " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", + " ws[0] -= sl[0] - 1\n", + " # Delete first seq_len, not needed anymore\n", + " self.seq_lens = [sl[1:] for sl in self.seq_lens]\n", + " # Add window size for output to ease calculation later\n", + " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", + " ws.append(0) \n", + "\n", + " def acceptance_rate(self, return_mean=True, normalize=False):\n", + " # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n", + " ar_per_win = []\n", + " for sl, ws in zip(self.seq_lens, self.win_sizes):\n", + " sl = np.array(sl, dtype=np.float64)\n", + " ws = np.array(ws, dtype=np.float64)\n", + " out_lens = sl - ws\n", + " accepted = (out_lens[1:] - out_lens[:-1] - 1)\n", + " ar_per_win.append(np.divide(accepted, ws[:-1],\n", + " out=np.zeros_like(accepted),where=ws[:-1] != 0))\n", + " ar_per_win = np.hstack(ar_per_win)\n", + " # Normalized AR doesn't take into account windows with size 0\n", + " if normalize:\n", + " ar_per_win = ar_per_win[np.nonzero(np.hstack([ws[:-1] for ws in self.win_sizes]))]\n", + " return np.mean(ar_per_win) if return_mean else ar_per_win" + ] + }, + { + "cell_type": "markdown", + "id": "c35f5e0c-5ed6-4011-a295-80a81fea8b8e", + "metadata": {}, + "source": [ + "Now we can use any dataset for text generation task and measure the AR on that dataset.\n", + "Here we use the [HumanEval](https://huggingface.co/datasets/openai_humaneval) dataset for evaluating code generation.\n", + "We run the model with speculative decoding on 30 samples.\n", + "As you will see, we are getting a very good AR of ~75% for the current configuration.\n", + "\n", + "Note that 
running this test can take a few minutes depending on the number of samples you are evaluating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "681a4974-43df-4934-8b61-75c3a92b6df1", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from datasets import load_dataset\n", + "\n", + "dataset_name = \"openai_humaneval\"\n", + "dataset_subset_name = None\n", + "field_name = \"prompt\"\n", + "prompt_template = \"\"\"{text}\"\"\"\n", + "dataset = load_dataset(dataset_name, dataset_subset_name, split=\"test\")[field_name]\n", + "samples_number = 30\n", + "with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n", + " for text in tqdm(dataset[:samples_number]):\n", + " tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n", + " stateless_model.generate(\n", + " **tokenized_prompt,\n", + " max_new_tokens=128,\n", + " pad_token_id=tokenizer.eos_token_id,\n", + " assistant_model=asst_model,\n", + " )\n", + "print(f\"Acceptance rate: {ar_recorder.acceptance_rate() * 100:.2f}%\")" + ] + }, { "cell_type": "markdown", "id": "3f8aa25c-de59-4e79-9a1f-c03ec76d206a", "metadata": {}, "source": [ "## Chatbot demo\n", - "We will continue to build a chatbot demo running with Gradio using the model we just exported and quantized.\n", + "We will continue to build a chatbot demo running with Gradio using the models we just exported and quantized.\n", "The chatbot will be rather simple where the user will input a message and the model will reply to the user by generating text using the entire chat history as the input to the model.\n", + "We will also add an option to accelerate inference using speculative decoding with a draft model as we described in the previous section.\n", "\n", "A lot of models that were trained for the chatbot use case have been trained with special tokens to tell the model who is the current speaker and with a special system message. 
\n", "Phi-2 wasn't trained specifically for the chatbot use case and doesn't have any special tokens either, however, it has seen chats in the training data and therefore is suited for that use case.\n", @@ -328,7 +660,7 @@ " return input_token\n", "\n", "\n", - "def generate(history, temperature, max_new_tokens, top_p, repetition_penalty):\n", + "def generate(history, temperature, max_new_tokens, top_p, repetition_penalty, assisted):\n", " \"\"\"\n", " Generates the assistant's reponse given the chatbot history and generation parameters\n", "\n", @@ -339,6 +671,7 @@ " max_new_tokens: The maximum number of tokens we allow the model to generate as a response.\n", " top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.\n", " repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.\n", + " assisted: boolean parameter to enable/disable assisted generation with speculative decoding.\n", " Yields:\n", " Updated history and generation status.\n", " \"\"\"\n", @@ -354,15 +687,15 @@ " inputs = prepare_history_for_model(history)\n", " input_length = inputs['input_ids'].shape[1]\n", "\n", - " prompt_char = '▌'\n", + " prompt_char = \"▌\"\n", " history[-1][1] = prompt_char\n", - " yield (history, \"Status: Generating...\")\n", + " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", " \n", " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "\n", " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", - " stop_str = f'\\nUser:'\n", - " stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, [stop_str], tokenizer)])\n", + " stop_str = [\"\\nUser:\", \"\\nAssistant:\", \"\\nRules:\", \"\\nQuestion:\"]\n", + " stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, stop_str, tokenizer)])\n", " # Prepare input for generate\n", " generation_config = GenerationConfig(\n", " max_new_tokens=max_new_tokens,\n", @@ -379,7 +712,13 @@ " stopping_criteria=stopping_criteria,\n", " ) | inputs\n", "\n", - " t1 = Thread(target=model.generate, kwargs=generate_kwargs)\n", + " if assisted:\n", + " target_generate = stateless_model.generate\n", + " generate_kwargs[\"assistant_model\"] = asst_model\n", + " else:\n", + " target_generate = model.generate\n", + "\n", + " t1 = Thread(target=target_generate, kwargs=generate_kwargs)\n", " t1.start()\n", "\n", " # Initialize an empty string to store the generated text.\n", @@ -387,17 +726,18 @@ " for new_text in streamer:\n", " partial_text += new_text\n", " history[-1][1] = partial_text + prompt_char\n", - " # We don't yield the generated text until we are sure it is not the stop string\n", - " pos = partial_text.rfind(stop_str)\n", + " for s in stop_str:\n", + " if (pos := partial_text.rfind(s)) != -1:\n", + " break\n", " if pos != -1:\n", " partial_text = partial_text[:pos]\n", " break\n", - " elif is_partial_stop(partial_text, stop_str):\n", + " elif any([is_partial_stop(partial_text, s) for s in stop_str]):\n", " continue\n", - " yield (history, \"Status: Generating...\")\n", + " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", " history[-1][1] = partial_text\n", " generation_time = time.perf_counter() - start\n", - " yield (history, f'Generation time: {generation_time:.2f} sec')" + " yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)" ] }, { @@ 
-430,6 +770,11 @@ "source": [ "import gradio as gr\n", "\n", + "try:\n", + " demo.close()\n", + "except:\n", + " pass\n", + "\n", "\n", "EXAMPLES = [\n", " [\"What is OpenVINO?\"],\n", @@ -455,14 +800,29 @@ " return ('', history)\n", "\n", "\n", + "def prepare_for_regenerate(history):\n", + " \"\"\"\n", + " Delete last assistant message to prepare for regeneration\n", + "\n", + " Params:\n", + " history: conversation history\n", + " Returns:\n", + " updated history\n", + " \"\"\" \n", + " history[-1][1] = None\n", + " return history\n", + "\n", + "\n", "with gr.Blocks(theme=gr.themes.Soft()) as demo:\n", " gr.Markdown('
Chat with Phi-2 on Meteor Lake iGPU
')\n", " chatbot = gr.Chatbot()\n", " with gr.Row():\n", + " assisted = gr.Checkbox(value=False, label=\"Assisted Generation\", scale=10)\n", " msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n", - " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=25)\n", + " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n", " with gr.Row():\n", " submit = gr.Button(\"Submit\", variant='primary')\n", + " regenerate = gr.Button(\"Regenerate\")\n", " clear = gr.Button(\"Clear\")\n", " with gr.Accordion(\"Advanced Options:\", open=False):\n", " with gr.Row():\n", @@ -513,12 +873,24 @@ " queue=False,\n", " ).then(\n", " fn=generate,\n", - " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty],\n", - " outputs=[chatbot, status],\n", + " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", + " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", + " concurrency_limit=1,\n", + " queue=True\n", + " )\n", + " regenerate.click(\n", + " fn=prepare_for_regenerate,\n", + " inputs=chatbot,\n", + " outputs=chatbot,\n", + " queue=True,\n", + " concurrency_limit=1\n", + " ).then(\n", + " fn=generate,\n", + " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n", + " outputs=[chatbot, status, msg, submit, regenerate, clear],\n", " concurrency_limit=1,\n", " queue=True\n", " )\n", - " \n", " clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)" ] }, @@ -575,7 +947,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/notebooks/openvino/question_answering_quantization.ipynb b/notebooks/openvino/question_answering_quantization.ipynb index ba4a84ca38..196e3ba6a7 100644 --- a/notebooks/openvino/question_answering_quantization.ipynb +++ b/notebooks/openvino/question_answering_quantization.ipynb @@ -51,7 +51,7 @@ "import transformers\n", "from evaluate import evaluator\n", "from openvino.runtime import Core\n", - "from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer\n", + "from optimum.intel.openvino import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n", "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n", "\n", "transformers.logging.set_verbosity_error()\n", @@ -286,11 +286,11 @@ "**NOTE:** if you notice very low accuracy after post-training quantization, it is likely caused by an overflow issue which affects processors that do not contain VNNI (Vector Neural Network Instruction). NNCF has an `overflow_fix` option to address this. It will effectively use 7-bits for quantizing instead of 8-bits to prevent the overflow. 
To use this option, modify the code in the next cell to add an explicit quantization configuration, and set `overflow_fix` to `\"enable\"`:\n", "\n", "```\n", - "from optimum.intel.openvino import OVConfig\n", + "from optimum.intel.openvino import OVConfig, OVQuantizationConfig\n", "\n", - "ov_config = OVConfig()\n", - "ov_config.compression[\"overflow_fix\"] = \"enable\"\n", - "quantizer = OVQuantizer.from_pretrained(model, ov_config=ov_config)\n", + "ov_config = OVConfig(quantization_config=OVQuantizationConfig(overflow_fix=\"enable\")\n", + "quantizer = OVQuantizer.from_pretrained(model)\n", + "quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path, ov_config=ov_config)\n", "```\n", "\n", "For more information, see [Lower Numerical Precision Deep Learning Inference and Training](https://www.intel.com/content/www/us/en/developer/articles/technical/lower-numerical-precision-deep-learning-inference-and-training.html)" @@ -317,7 +317,8 @@ "\n", "# Quantize the model\n", "quantizer = OVQuantizer.from_pretrained(model)\n", - "quantizer.quantize(calibration_dataset=train_dataset, save_directory=int8_ptq_model_path)" + "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n", + "quantizer.quantize(calibration_dataset=train_dataset, ov_config=ov_config, save_directory=int8_ptq_model_path)" ] }, { diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index b5d9ae7001..41969b162a 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "dffab375-a730-4015-8d17-360b76a0718d", "metadata": {}, "outputs": [], @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "0407fc92-c052-47b7-8721-01836adf3b54", "metadata": { "execution": { @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "id": "c32f9a76-414b-43d9-9769-af131223f1c1", "metadata": { "execution": { @@ -125,14 +125,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "92a3f434", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "caab335ab7b146bba41c81e8688759f9", + "model_id": "31afdad1b284494aa51e668f5d8fc5c8", "version_major": 2, "version_minor": 0 }, @@ -147,7 +147,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'image': , 'filename': 'COCO_train2014_000000494175.jpg', 'cocoid': 494175, 'caption': 'A crowded arena filled with people and confetti.'}\n" + "{'image': , 'filename': 'COCO_train2014_000000494175.jpg', 'cocoid': 494175, 'caption': 'A crowded arena filled with people and confetti.'}\n" ] } ], @@ -158,24 +158,18 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "8be68958-ce5e-4cc6-b8e7-2867feaf084b", - "metadata": { - "execution": { - "iopub.execute_input": "2022-12-04T14:40:31.358230Z", - "iopub.status.busy": "2022-12-04T14:40:31.358053Z", - "iopub.status.idle": "2022-12-04T14:40:31.360666Z", - "shell.execute_reply": "2022-12-04T14:40:31.360301Z", - "shell.execute_reply.started": "2022-12-04T14:40:31.358218Z" - }, - "tags": [] - }, + "execution_count": 5, + "id": "1036fe23", + "metadata": {}, "outputs": [], "source": [ "def preprocess_fn(example):\n", " return {\"prompt\": example[\"caption\"]}\n", "\n", - "calibration_dataset = 
dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)" + "NUM_SAMPLES = 200\n", + "dataset = dataset.take(NUM_SAMPLES)\n", + "dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)\n", + "calibration_dataset = list(dataset)" ] }, { @@ -190,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "31c5415e-e22b-4ab9-b903-8791e80b188d", "metadata": { "execution": { @@ -204,7 +198,6 @@ }, "outputs": [], "source": [ - "NUM_SAMPLES = 200\n", "quantization_config = OVWeightQuantizationConfig(bits=8, dataset=calibration_dataset, num_samples=NUM_SAMPLES)\n", "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True, quantization_config=quantization_config)\n", "int8_pipe.save_pretrained(int8_model_path)" @@ -251,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "6c2f615a-19e3-4ee2-9309-2ae1392c7f62", "metadata": { "execution": { @@ -290,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "90902149", "metadata": {}, "outputs": [], @@ -311,7 +304,36 @@ "execution_count": null, "id": "02f01fc1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eb96ea3ef90f4b7488cb7b92853b5ef7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00" ] @@ -350,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "1eeaa81f-7fc5-49ba-80b8-2d95a1310a0c", "metadata": { "execution": { @@ -388,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "91134d48", "metadata": {}, "outputs": [ @@ -397,7 +419,7 @@ "output_type": "stream", "text": [ "FP32 model size: 4920.93 MB\n", - "INT8 model size: 1240.23 MB\n", + "INT8 model size: 1240.29 MB\n", "INT8 size decrease: 3.97x\n" ] } @@ -424,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "8806da79-0b3b-403e-a40c-61db6a0f482d", "metadata": { "execution": { @@ -447,7 +469,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6816982d5d7e412fb02d1efcf972a95d", + "model_id": "bdf72f14022b4944b2fe6f5e482b01f2", "version_major": 2, "version_minor": 0 }, @@ -461,7 +483,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7c51ada2ecb84e6398f82c10acd523ae", + "model_id": "fd9c55cebb4646b7b606a4d6b177889b", "version_major": 2, "version_minor": 0 }, @@ -475,7 +497,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7bf9616672ef4743a5e05f519e47ccae", + "model_id": "2599d0b1599a4bedac510cd0382262d0", "version_major": 2, "version_minor": 0 }, @@ -490,13 +512,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Latency of original FP32 model: 355705.40 ms\n" + "Latency of original FP32 model: 212177.56 ms\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2a8db52629c7434da87864707e1c8023", + "model_id": "8ca3ba9b0dca486dbc5fa6c5b508f1b8", "version_major": 2, "version_minor": 0 }, @@ -510,7 +532,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f65d645f05464727bb05b7a0028353fb", + "model_id": "27a597e75cc44a3db5360bbde7b613e8", "version_major": 2, "version_minor": 0 }, @@ -524,7 +546,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f1452100a86443e29f4d6f62473cfecf", + "model_id": 
"f15ec840672b4568aea1882539d7bb33", "version_major": 2, "version_minor": 0 }, @@ -539,14 +561,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Latency of quantized model: 315484.52 ms\n", - "Speedup: 1.13x\n" + "Latency of quantized model: 162504.42 ms\n", + "Speedup: 1.31x\n" ] } ], "source": [ "def get_val_dataset(num_items=3):\n", - " return [item[\"caption\"] for item in dataset.take(num_items)]\n", + " return [item[\"prompt\"] for item in dataset.take(num_items)]\n", "\n", "def benchmark(pipeline, dataset):\n", " \"\"\"\n", diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 997ec44aa5..cdae847468 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Optional from ...exporters import TasksManager +from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -104,6 +105,16 @@ def parse_args_openvino(parser: "ArgumentParser"): default=None, help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."), ) + optional_group.add_argument( + "--dataset", + type=str, + default=None, + help=( + "The dataset used for data-aware compression or quantization with NNCF. " + "You can use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs " + "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models." + ), + ) optional_group.add_argument( "--disable-stateful", action="store_true", @@ -115,10 +126,23 @@ def parse_args_openvino(parser: "ArgumentParser"): "OpenVINO native inference code that expects kv-cache inputs and outputs in the model." ), ) + optional_group.add_argument( + "--disable-convert-tokenizer", + action="store_true", + help="Do not add converted tokenizer and detokenizer OpenVINO models.", + ) optional_group.add_argument( "--convert-tokenizer", action="store_true", - help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers", + help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.", + ) + + optional_group.add_argument( + "--library", + type=str, + choices=["transformers", "diffusers", "timm", "sentence_transformers"], + default=None, + help=("The library on the model. If not provided, will attempt to infer the local checkpoint's library"), ) @@ -187,19 +211,68 @@ def run(self): ) quantization_config["sym"] = "asym" not in self.args.weight_format quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64 + quantization_config["dataset"] = self.args.dataset ov_config = OVConfig(quantization_config=quantization_config) - # TODO : add input shapes - main_export( - model_name_or_path=self.args.model, - output=self.args.output, - task=self.args.task, - framework=self.args.framework, - cache_dir=self.args.cache_dir, - trust_remote_code=self.args.trust_remote_code, - pad_token_id=self.args.pad_token_id, - ov_config=ov_config, - stateful=not self.args.disable_stateful, - convert_tokenizer=self.args.convert_tokenizer, - # **input_shapes, - ) + library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library) + if library_name == "sentence_transformers" and self.args.library is None: + logger.warning( + "Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`." 
+ "`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers" + ) + library_name = "transformers" + + if ( + library_name == "diffusers" + and ov_config + and ov_config.quantization_config + and ov_config.quantization_config.dataset is not None + ): + if not is_diffusers_available(): + raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models")) + + from diffusers import DiffusionPipeline + + diffusers_config = DiffusionPipeline.load_config(self.args.model) + class_name = diffusers_config.get("_class_name", None) + + if class_name == "LatentConsistencyModelPipeline": + from optimum.intel import OVLatentConsistencyModelPipeline + + model_cls = OVLatentConsistencyModelPipeline + + elif class_name == "StableDiffusionXLPipeline": + from optimum.intel import OVStableDiffusionXLPipeline + + model_cls = OVStableDiffusionXLPipeline + elif class_name == "StableDiffusionPipeline": + from optimum.intel import OVStableDiffusionPipeline + + model_cls = OVStableDiffusionPipeline + else: + raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") + + model = model_cls.from_pretrained( + self.args.model, export=True, quantization_config=ov_config.quantization_config + ) + model.save_pretrained(self.args.output) + + else: + if self.args.convert_tokenizer: + logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") + + # TODO : add input shapes + main_export( + model_name_or_path=self.args.model, + output=self.args.output, + task=self.args.task, + framework=self.args.framework, + cache_dir=self.args.cache_dir, + trust_remote_code=self.args.trust_remote_code, + pad_token_id=self.args.pad_token_id, + ov_config=ov_config, + stateful=not self.args.disable_stateful, + convert_tokenizer=not self.args.disable_convert_tokenizer, + library_name=library_name, + # **input_shapes, + ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 5d6e31ebac..d7b29584d6 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -22,11 +22,10 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED +from optimum.exporters.openvino.convert import export_from_model, export_tokenizer +from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors -from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version -from .convert import export_from_model, export_tokenizer - if TYPE_CHECKING: from optimum.intel.openvino.configuration import OVConfig @@ -77,7 +76,7 @@ def main_export( model_name_or_path (`str`): Model ID on huggingface.co or path on disk to the model repository to export. output (`Union[str, Path]`): - Path indicating the directory where to store the generated ONNX model. + Path indicating the directory where to store the generated OpenVINO model. 
> Optional parameters @@ -163,10 +162,18 @@ def main_export( original_task = task task = TasksManager.map_from_synonym(task) framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) + library_name_is_not_provided = library_name is None library_name = TasksManager.infer_library_from_model( model_name_or_path, subfolder=subfolder, library_name=library_name ) + if library_name == "sentence_transformers" and library_name_is_not_provided: + logger.warning( + "Library name is not specified. There are multiple possible variants: `sentence_tenasformers`, `transformers`." + "`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers" + ) + library_name = "transformers" + if task == "auto": try: task = TasksManager.infer_task_from_model(model_name_or_path) @@ -179,12 +186,6 @@ def main_export( f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" ) - if convert_tokenizer and not is_openvino_tokenizers_available(): - logger.warning( - "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`" - ) - convert_tokenizer = False - do_gptq_patching = False custom_architecture = False loading_kwargs = {} @@ -202,7 +203,6 @@ def main_export( quantization_config = getattr(config, "quantization_config", None) do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" model_type = config.model_type.replace("_", "-") - if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True elif task not in TasksManager.get_supported_tasks_for_model_type( @@ -220,6 +220,20 @@ def main_export( ) if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: loading_kwargs["attn_implementation"] = "eager" + # there are some difference between remote and in library representation of past key values for some models, + # for avoiding confusion we disable remote code for them + if ( + trust_remote_code + and model_type in {"falcon", "mpt", "phi"} + and ("with-past" in task or original_task == "auto") + and not custom_export_configs + ): + logger.warning( + f"Model type `{model_type}` export for task `{task}` is not supported for loading with `trust_remote_code=True`" + "using default export configuration, `trust_remote_code` will be disabled. " + "Please provide custom export config if you want load model with remote code." 
+ ) + trust_remote_code = False # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: @@ -327,7 +341,7 @@ class StoreAttr(object): **kwargs_shapes, ) - if convert_tokenizer: + if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": tokenizer = next( (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)), @@ -350,6 +364,8 @@ class StoreAttr(object): tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: export_tokenizer(tokenizer_2, output, suffix="_2") + elif convert_tokenizer and not is_openvino_tokenizers_available(): + logger.warning("Tokenizer won't be converted.") # Unpatch modules after GPTQ export if do_gptq_patching: diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 98dd22d824..55e3318017 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -20,7 +20,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union -from transformers import T5Tokenizer, T5TokenizerFast from transformers.utils import is_tf_available, is_torch_available from openvino.runtime import PartialShape, save_model @@ -49,9 +48,6 @@ ) -UNSUPPORTED_TOKENIZER_CLASSES = (T5Tokenizer, T5TokenizerFast) - - logger = logging.getLogger(__name__) if is_torch_available(): @@ -345,7 +341,7 @@ def ts_patched_forward(*args, **kwargs): input_dict = dict(zip(keys, tuple_input)) kwargs[input_name] = input_dict outputs = patched_forward(*args, **kwargs) - return tuple(outputs.values()) + return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs.values()]) patcher.patched_forward = ts_patched_forward @@ -382,6 +378,8 @@ def ts_patched_forward(*args, **kwargs): sig = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.call) ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} + if not ordered_dummy_inputs: + ordered_dummy_inputs = dummy_inputs ordered_input_names = list(inputs) flatten_inputs = flattenize_inputs(ordered_dummy_inputs.values()) ov_model.validate_nodes_and_infer_types() @@ -564,6 +562,7 @@ def export_from_model( kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] ) + logging.disable(logging.INFO) export_config, models_and_export_configs = _get_submodels_and_export_configs( model=model, task=task, @@ -578,6 +577,7 @@ def export_from_model( legacy=False, exporter="openvino", ) + logging.disable(logging.NOTSET) if ov_config is None: if library_name == "diffusers": @@ -658,10 +658,6 @@ def export_tokenizer( ): from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports - if isinstance(tokenizer, UNSUPPORTED_TOKENIZER_CLASSES): - logger.info(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.") - return - try: from openvino_tokenizers import convert_tokenizer except ModuleNotFoundError: @@ -677,13 +673,13 @@ def export_tokenizer( try: converted = convert_tokenizer(tokenizer, with_detokenizer=True) except NotImplementedError: - logger.warning("Detokenizer is not supported, convert tokenizer only.") + logger.info("Detokenizer is not supported, convert tokenizer only.") converted = convert_tokenizer(tokenizer, with_detokenizer=False) except OVTypeError: - logger.warning(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not 
supported.") + logger.debug(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.") return except Exception as exception: - logger.warning( + logger.debug( f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported. Exception: {exception}" ) return diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a274b3671d..90297c8fb3 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -19,7 +19,7 @@ from transformers.utils import is_tf_available from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig -from optimum.exporters.onnx.model_configs import GemmaOnnxConfig +from optimum.exporters.onnx.model_configs import GemmaOnnxConfig, LlamaOnnxConfig from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( @@ -34,6 +34,7 @@ BaichuanModelPatcher, ChatGLMModelPatcher, GemmaModelPatcher, + LlamaModelPatcher, MixtralModelPatcher, QwenModelPatcher, ) @@ -74,7 +75,7 @@ def init_model_configs(): @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") -class BaichaunOpenVINOConfig(TextDecoderOnnxConfig): +class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" @@ -274,6 +275,24 @@ def patch_model_for_export( return GemmaModelPatcher(self, model, model_kwargs=model_kwargs) +@register_in_tasks_manager( + "llama", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class LlamaOpenVINOConfig(LlamaOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return LlamaModelPatcher(self, model, model_kwargs=model_kwargs) + + class QwenDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, @@ -400,3 +419,21 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") +class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") +class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py 
index 2cedf64b0a..3649c163c6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -288,10 +288,74 @@ def __exit__(self, exc_type, exc_value, traceback): block.self_attention.core_attention.forward = block.self_attention.core_attention._orig_forward +# adapted from +# https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/gemma/modeling_gemma.py#L965 +# https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/llama/modeling_llama.py#L1058 +def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_position, **kwargs): + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # for compatibility with https://github.com/huggingface/transformers/pull/30047 + current_length = kwargs.get("current_length", cache_position[-1]) + dtype, device = input_tensor.dtype, input_tensor.device + + # using the minimum from a dtype with larger bandwidth (float32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[-2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + class GemmaModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() + # gemma has some accuracy issues with bf16 with transformers >= 4.39 + # fill the causal mask in a slightly different way to avoid overflow on some platforms + if is_transformers_version(">=", "4.39.0"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + _llama_gemma_update_causal_mask, self._model.model + ) + # init inv_freq for torchscript tracing # https://github.com/huggingface/transformers/blob/ed74d97871468f3a4695ede50abdc0b55717a84d/src/transformers/models/gemma/modeling_gemma.py#L108 for layer in self._model.model.layers: @@ -301,6 +365,29 @@ def __enter__(self): rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) ) + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if hasattr(self._model.model, "_orig_update_causal_mask"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + + +class LlamaModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + + # llama has some accuracy issues with bf16 with transformers >= 4.39 + # fill the causal mask in a slightly different way to avoid overflow on some platforms + if is_transformers_version(">=", "4.39.0"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + _llama_gemma_update_causal_mask, self._model.model + ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if hasattr(self._model.model, "_orig_update_causal_mask"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + SUPPORT_SDPA = is_torch_version(">", "2.1.0") @@ -444,9 +531,10 @@ def _qwen_attention_forward( value = value.permute(0, 2, 1, 3) if not self.use_cache_quantization and SUPPORT_SDPA: - causal_mask = registered_causal_mask[:, :, key.size(-2) - query.size(-2) : key.size(-2), : key.size(-2)] + # For performance, using constant tril to generate causal_mask + causal_mask = self.bias[:, :, key.size(-2) - query.size(-2) : key.size(-2), : key.size(-2)] if attention_mask is not None: - attention_mask = attention_mask.expand(-1, -1, causal_mask.size(2), -1).masked_fill( + attention_mask = attention_mask.expand(-1, -1, query.size(2), -1).masked_fill( ~causal_mask, torch.finfo(query.dtype).min ) else: @@ -465,7 +553,6 @@ def _qwen_attention_forward( raise ValueError("Cannot output attentions while using flash-attn") else: outputs += (attn_weight,) - return outputs @@ -492,8 +579,17 @@ def __init__( def __enter__(self): super().__enter__() + max_positions = self._model.config.seq_length for block in self._model.transformer.h: block.attn._orig_forward = block.attn.forward + # For performance, using constant tril to generate causal_mask + block.attn.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) block.attn.forward = types.MethodType(_qwen_attention_forward, block.attn) def __exit__(self, exc_type, exc_value, traceback): @@ -513,5 +609,5 @@ def __init__( ): super().__init__(config,
model, model_kwargs) # model has first inference buffers initialization - if self._model.lm_head.first_flag: + if hasattr(self._model.lm_head, "first_flag"): self._model(torch.ones((1, 10), dtype=torch.int64), torch.ones((1, 10), dtype=torch.int64)) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 29abd00034..f9234cb3b1 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -59,9 +59,13 @@ if not (is_openvino_available() and is_nncf_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - _import_structure["utils.dummy_openvino_and_nncf_objects"].extend(["OVQuantizer", "OVTrainingArguments"]) + _import_structure["utils.dummy_openvino_and_nncf_objects"].extend( + ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig"] + ) else: - _import_structure["openvino"].extend(["OVQuantizer", "OVTrainingArguments"]) + _import_structure["openvino"].extend( + ["OVQuantizer", "OVTrainingArguments", "OVQuantizationConfig", "OVWeightQuantizationConfig"] + ) try: @@ -112,8 +116,9 @@ "OVModelForAudioClassification", "OVModelForAudioFrameClassification", "OVModelForAudioXVector", - "OVModelForCTC", "OVModelForCausalLM", + "OVModelForCTC", + "OVModelForCustomTasks", "OVModelForFeatureExtraction", "OVModelForImageClassification", "OVModelForMaskedLM", @@ -124,7 +129,6 @@ "OVModelForVision2Seq", "OVModelForSequenceClassification", "OVModelForTokenClassification", - "OVWeightQuantizationConfig", "OVConfig", ] ) @@ -187,9 +191,14 @@ if not (is_openvino_available() and is_nncf_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_openvino_and_nncf_objects import OVQuantizer, OVTrainingArguments + from .utils.dummy_openvino_and_nncf_objects import ( + OVQuantizationConfig, + OVQuantizer, + OVTrainingArguments, + OVWeightQuantizationConfig, + ) else: - from .openvino import OVQuantizer, OVTrainingArguments + from .openvino import OVQuantizationConfig, OVQuantizer, OVTrainingArguments, OVWeightQuantizationConfig try: if not (is_openvino_available() and is_nncf_available() and is_accelerate_available()): @@ -234,6 +243,7 @@ OVModelForAudioXVector, OVModelForCausalLM, OVModelForCTC, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, @@ -243,7 +253,6 @@ OVModelForSpeechSeq2Seq, OVModelForTokenClassification, OVModelForVision2Seq, - OVWeightQuantizationConfig, ) try: diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 0664a8e6ac..8a7a4f2028 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -89,6 +89,10 @@ def ipex_jit_trace(model, task, use_cache): model.config.return_dict = False + if "past_key_values" in sample_inputs and use_cache: + # Make sure the model will output past_key_values in generation tasks + model.config.use_cache = True + model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory ipex._C.disable_jit_linear_repack() diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index c7a48aedb9..c46e3f41c5 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -67,11 +67,6 @@ """ -if is_intel_extension_for_transformers_available(): - from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM 
as ITREX_WOQ_MODEL - from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig - - class INCModel(OptimizedModel): auto_model_class = AutoModel export_feature = "feature-extraction" @@ -142,15 +137,16 @@ def _from_pretrained( msg = None if is_intel_extension_for_transformers_available(): try: - quantization_config = WeightOnlyQuantConfig.from_pretrained(model_id) - algorithm = getattr(quantization_config, "algorithm", None) - if algorithm is not None and quantization_config.algorithm.lower() in { - "rtn", - "gptq", - "awq", - "autoaround", - }: - return ITREX_WOQ_MODEL.from_pretrained( + quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json") + algorithm = getattr(quantization_config, "quant_method", None) + if algorithm in {"rtn", "gptq", "awq", "autoaround"}: + from intel_extension_for_transformers.transformers.modeling.modeling_auto import ( + _BaseQBitsAutoModelClass, + ) + + _BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class + + return _BaseQBitsAutoModelClass.from_pretrained( pretrained_model_name_or_path=model_id, use_auth_token=use_auth_token, revision=revision, diff --git a/optimum/intel/neural_compressor/modeling_decoder.py b/optimum/intel/neural_compressor/modeling_decoder.py deleted file mode 100644 index f56969675b..0000000000 --- a/optimum/intel/neural_compressor/modeling_decoder.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import warnings - -from .modeling_base import INCModelForCausalLM - - -class INCModelForCausalLM(INCModelForCausalLM): - # warning at import time - warnings.warn( - "Importing `INCModelForCausalLM` from `optimum/intel/neural_compressor/modeling_decoder.py` is deprecated and will " - "be removed in a future verson of optimum-intel. Import as `from optimum.intel.neural_compressor import INCModelForCausalLM instead.", - FutureWarning, - ) diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index c20302c4bd..09f651df05 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -72,16 +72,22 @@ from .utils import INCDataLoader, _cfgs_to_fx_cfgs +INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.4.0" + if is_intel_extension_for_transformers_available(): - INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.3.2" if is_intel_extension_for_transformers_version("!=", INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION): raise ImportError( f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_intel_extension_for_transformers_version}, " f"but only version {INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION} is supported." 
) - from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model + from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit - from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig + from intel_extension_for_transformers.transformers.utils.config import ( + AwqConfig, + GPTQConfig, + ITREXQuantizationConfigMixin, + RtnConfig, + ) logger = logging.getLogger(__name__) @@ -89,7 +95,7 @@ NEURAL_COMPRESSOR_MINIMUM_VERSION = "2.1.0" NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION = "2.3.0" IPEX_MINIMUM_VERSION = "2.1.0" -_ITREX_TORCH_VERSION = "2.1.0" +ITREX_MINIMUM_TORCH_VERSION = "2.2.0" if is_neural_compressor_version("<", NEURAL_COMPRESSOR_MINIMUM_VERSION): raise ImportError( @@ -152,21 +158,20 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - quantization_config: Union["PostTrainingQuantConfig", "WeightOnlyQuantConfig"], + quantization_config: Union["PostTrainingQuantConfig", "ITREXQuantizationConfigMixin"], save_directory: Union[str, Path], calibration_dataset: Dataset = None, batch_size: int = 8, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, file_name: str = None, - weight_only: bool = False, **kwargs, ): """ Quantize a model given the optimization specifications defined in `quantization_config`. Args: - quantization_config (`Union[PostTrainingQuantConfig, WeightOnlyQuantConfig]`): + quantization_config (`Union[PostTrainingQuantConfig, ITREXQuantizationConfigMixin]`): The configuration containing the parameters related to quantization. save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. @@ -178,9 +183,6 @@ def quantize( The function to use to form a batch from a list of elements of the calibration dataset. remove_unused_columns (`bool`, defaults to `True`): Whether or not to remove the columns unused by the model forward method. - weight_only (`bool`, defaults to `False`): - Whether compress weights to integer precision (4-bit by default) while keeping activations - floating-point. Fits best for LLM footprint reduction and performance acceleration. """ save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) @@ -188,16 +190,41 @@ def quantize( device = kwargs.pop("device", "cpu") use_cpu = device == torch.device("cpu") or device == "cpu" use_xpu = device == torch.device("xpu") or device == "xpu" + calibration_dataloader = None - if save_onnx_model and (isinstance(self._original_model, ORTModel) or weight_only): + if save_onnx_model and isinstance(self._original_model, ORTModel): save_onnx_model = False logger.warning("Model provided is an ONNX model, `save_onnx_model` is set to False") default_name = WEIGHTS_NAME if not isinstance(self._original_model, ORTModel) else ONNX_WEIGHTS_NAME - calibration_dataloader = None self._set_task() - if weight_only or not isinstance(quantization_config, PostTrainingQuantConfig): + if kwargs.pop("weight_only", None) is not None: + logger.warning( + "`weight_only` is deprecated. Use `quantization_config` instead to specify which methodology and quantization parameters to apply."
+ ) + + if ( + isinstance(quantization_config, PostTrainingQuantConfig) + and quantization_config.backend == "ipex" + and is_ipex_version("<", IPEX_MINIMUM_VERSION) + and "generation" in self.task + ): + raise ImportError( + f"Found an incompatible version of intel-extension-for-pytorch. Found version {_ipex_version}, " + f"but only version {IPEX_MINIMUM_VERSION} or higher is supported." + ) + + if save_onnx_model: + if ( + not isinstance(quantization_config, PostTrainingQuantConfig) + or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC + ): + logger.warning("ONNX export for dynamic and weight-only quantized models is not supported.") + save_onnx_model = False + + # ITREX Weight Only Quantization + if not isinstance(quantization_config, PostTrainingQuantConfig): # check neural-compressor version if is_neural_compressor_version("<", NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION): raise ImportError( @@ -207,53 +234,45 @@ def quantize( if not is_intel_extension_for_transformers_available(): raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("Weight only quantization")) - if is_torch_version("!=", _ITREX_TORCH_VERSION): + if is_torch_version("<", ITREX_MINIMUM_TORCH_VERSION): raise ImportError( f"Found an incompatible version of `torch`. Found version {_torch_version}, " - f"but only version {_ITREX_TORCH_VERSION} is supported." + f"but only version {ITREX_MINIMUM_TORCH_VERSION} or higher is supported." ) - if quantization_config is None: - quantization_config = WeightOnlyQuantConfig() - algo = "RTN" - elif isinstance(quantization_config, WeightOnlyQuantConfig): - algo = quantization_config.algorithm - else: + if not isinstance(quantization_config, ITREXQuantizationConfigMixin): raise TypeError( - f"For weight-only quantization, `quantization_config` should be an instance of `WeightOnlyQuantConfig`, but got: {type(quantization_config)} instead." + "`quantization_config` should either be an instance of `neural_compressor.config.PostTrainingQuantConfig` or " + f"`intel_extension_for_transformers.transformers.utils.config.ITREXQuantizationConfigMixin` but got: {type(quantization_config)} instead." ) - if algo not in ["RTN", "GPTQ"]: - raise ValueError(f"Weight-only quantization is only support RTN and GPTQ algorithm now!But got {algo}") + if not isinstance(quantization_config, (GPTQConfig, RtnConfig)): + raise ValueError( + f"Weight-only quantization currently only supports the RTN and GPTQ algorithms, but got {quantization_config}" + ) - if calibration_dataset is None and quantization_config.tokenizer is None and ("GPTQ" in algo): + if calibration_dataset is None and isinstance(quantization_config, (GPTQConfig, AwqConfig)): raise ValueError( "Weight-only quantization needs a calibration dataset for both GPTQ and AWQ methodologies." ) - if calibration_dataset is None: - calibration_dataloader = None - else: + if calibration_dataset is not None: calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, remove_unused_columns=remove_unused_columns, data_collator=data_collator, - use_label=False if "GPTQ" in algo else True, + use_label=not isinstance(quantization_config, (GPTQConfig)), ) quantization_config.calib_dataloader = calibration_dataloader - save_onnx_model = False - elif INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.STATIC: # Since PyTorch fx trace does not really require an example_inputs, only need calibration_dataset or calibration_fn here.
if calibration_dataset is None and self.calibration_fn is None: raise ValueError( "Post-training static quantization needs a calibration dataset or a calibration_function." ) - if calibration_dataset is None: - calibration_dataloader = None - else: + if calibration_dataset is not None: quantization_config.calibration_sampling_size = len(calibration_dataset) calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, @@ -266,45 +285,24 @@ def quantize( logger.warning("ONNX export is no supported for model with quantized embeddings") save_onnx_model = False - else: - # Disable ONNX export for dynamically quantized model as deprecated in neural-compressor>=2.2.0 - if save_onnx_model: - logger.warning( - "ONNX export for dynamic quantized model is no longer supported by neural-compressor>=2.2.0. " - "To apply dynamic quantization on an ONNX model, you can use optimum.onnxruntime.ORTQuantizer" - ) - save_onnx_model = False - - if ( - isinstance(quantization_config, PostTrainingQuantConfig) - and quantization_config.backend == "ipex" - and is_ipex_version("<", IPEX_MINIMUM_VERSION) - and "generation" in self.task - ): - raise ImportError( - f"Found an incompatible version of intel-extension-for-pytorch. Found version {_ipex_version}, " - f"but only version {IPEX_MINIMUM_VERSION} or higher is supported." - ) - if not isinstance(quantization_config, PostTrainingQuantConfig): if use_cpu: # will remove after intel-extension-for-transformers 1.3.3 release. quantization_config.device = "cpu" - quantization_config.post_init() + quantization_config.post_init_cpu() elif use_xpu: # will remove after intel-extension-for-transformers 1.3.3 release. quantization_config.device = "xpu" quantization_config.post_init_xpu() + self._quantized_model = convert_to_quantized_model( self._original_model, quantization_config, device=quantization_config.device ) - # will remove after intel-extension-for-transformers 1.3.3 release. - if hasattr(quantization_config, "calib_dataloader"): - quantization_config.calib_dataloader = None + self._quantized_model.quantization_config = quantization_config self._quantized_model.save_pretrained = types.MethodType(save_low_bit, self._quantized_model) - # Save the quantized model self._quantized_model.save_pretrained(save_directory) + else: if isinstance(self._original_model.config, PretrainedConfig): self._original_model.config.backend = quantization_config.backend @@ -336,10 +334,7 @@ def quantize( ) if not hasattr(compressed_model, "_model") or compressed_model._model is None: - raise RuntimeError( - "The maximum number of trials specified has been reached and no quantized model meeting the specified" - " accuracy tolerance has been found. Either the tolerance or the number of trials need to be increased." 
- ) + raise RuntimeError("Calling `neural_compressor.fit` returned unexpected results") if isinstance(self._original_model.config, PretrainedConfig): # If backend is IPEX, then the quantized model is JIT model which will drop the config attribute, @@ -376,7 +371,6 @@ def quantize( self._save_pretrained(compressed_model, output_path) quantization_config = INCConfig(quantization=quantization_config, save_onnx_model=save_onnx_model) quantization_config.save_pretrained(save_directory) - return self._quantized_model @staticmethod def _save_pretrained(model: Union[PyTorchModel, IPEXModel], output_path: str): diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index fc20cdafeb..b6089746e8 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -62,7 +62,6 @@ is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, - is_torch_tpu_available, logging, ) @@ -73,6 +72,12 @@ from .configuration import INCConfig +if is_transformers_version(">=", "4.39.0"): + from transformers.utils import is_torch_xla_available +else: + from transformers.utils import is_torch_tpu_available as is_torch_xla_available + + if is_accelerate_available(): from accelerate import __version__ as accelerate_version from accelerate import skip_first_batches @@ -95,7 +100,7 @@ if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp -if is_torch_tpu_available(check_device=False): +if is_torch_xla_available(): import torch_xla.core.xla_model as xm @@ -517,7 +522,7 @@ def _inner_training_loop( if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + and not is_torch_xla_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses @@ -611,7 +616,7 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: # Wait for everyone to get here so we are sure the model has been saved by process 0. 
- if is_torch_tpu_available(): + if is_torch_xla_available(): xm.rendezvous("load_best_model_at_end") elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() @@ -945,7 +950,7 @@ def get_model_sparsity(self): def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): # TODO : can be removed once transformers >= v4.38.0 if self.control.should_log and self.state.global_step > self._globalstep_last_logged: - if is_torch_tpu_available(): + if is_torch_xla_available(): xm.mark_step() logs: Dict[str, float] = {} diff --git a/optimum/intel/neural_compressor/trainer_seq2seq.py b/optimum/intel/neural_compressor/trainer_seq2seq.py index 123ced6030..27540cfb14 100644 --- a/optimum/intel/neural_compressor/trainer_seq2seq.py +++ b/optimum/intel/neural_compressor/trainer_seq2seq.py @@ -17,7 +17,7 @@ import torch from torch import nn from torch.utils.data import Dataset -from transformers.deepspeed import is_deepspeed_zero3_enabled +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from transformers.trainer_utils import PredictionOutput from transformers.utils import logging diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 27a966865f..b871668588 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -43,12 +43,13 @@ from .trainer import OVTrainer -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, OVModelForAudioXVector, OVModelForCTC, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 40a60bb58e..1634222dd6 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -11,71 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import copy +import inspect +import logging from dataclasses import dataclass +from enum import Enum from typing import Any, Dict, List, Optional, Union import torch from transformers import PretrainedConfig -from transformers.utils.quantization_config import QuantizationConfigMixin +from transformers.utils.quantization_config import QuantizationConfigMixin, QuantizationMethod from optimum.configuration_utils import BaseConfig +from ..utils.import_utils import is_nncf_available -DEFAULT_QUANTIZATION_CONFIG = { - "algorithm": "quantization", - "preset": "mixed", - "overflow_fix": "disable", - "initializer": { - "range": {"num_init_samples": 300, "type": "mean_min_max"}, - "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, - }, - "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, - "ignored_scopes": [ - "{re}.*Embedding.*", - "{re}.*add___.*", - "{re}.*layer_norm_.*", - "{re}.*matmul_1", - "{re}.*__truediv__.*", - ], -} -INT8_WEIGHT_COMPRESSION_CONFIG = { - "algorithm": "quantization", - "weights": { - "mode": "symmetric", - "bits": 8, - "target_scopes": [ - "{re}.*Embedding.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - ], - "ignored_scopes": [ - "{re}.*conv_*", - ], - }, - "activations": { - "ignored_scopes": [ - "{re}.*add___.*", - "{re}.*__radd___.*", - "{re}.*layer_norm_.*", - "{re}.*__truediv__.*", - "{re}.*__mul___.*", - "{re}.*__rmul___.*", - "{re}.*tanh_.*", - "{re}.*pow_.*", - "{re}.*matmul_.*", - "{re}.*addmm_.*", - "{re}.*baddmm_.*", - "{re}.*linear_.*", - "{re}.*conv_.*", - ], - }, - "overflow_fix": "disable", -} +if is_nncf_available(): + import nncf +logger = logging.getLogger(__name__) _DEFAULT_4BIT_CONFIGS = { "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, @@ -100,31 +55,81 @@ } +class OVQuantizationMethod(str, Enum): + DEFAULT = "default" + + +@dataclass +class OVQuantizationConfigBase(QuantizationConfigMixin): + """ + Base configuration class for quantization parameters + """ + + quant_method = OVQuantizationMethod.DEFAULT + + def __init__( + self, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = None, + weight_only: Optional[bool] = None, + **kwargs, + ): + """ + Args: + ignored_scope (`dict`, *optional*): + An ignored scope that defines a list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + weight_only (`bool`, *optional*): + Used to explicitly specify type of quantization (weight-only of full) to apply. 
+ """ + if isinstance(ignored_scope, nncf.IgnoredScope): + ignored_scope = ignored_scope.__dict__ + self.ignored_scope = ignored_scope + self.num_samples = num_samples + self.weight_only = weight_only + + def post_init(self): + try: + self.get_ignored_scope_instance() + except Exception as e: + raise ValueError( + f"Can't create an `IgnoredScope` object from the provided ignored scope dict: {self.ignored_scope}.\n{e}" + ) + if not (self.num_samples is None or isinstance(self.num_samples, int) and self.num_samples > 0): + raise ValueError(f"`num_samples` is expected to be a positive integer, but found: {self.num_samples}") + + def get_ignored_scope_instance(self) -> "nncf.IgnoredScope": + if self.ignored_scope is None: + return nncf.IgnoredScope() + return nncf.IgnoredScope(**copy.deepcopy(self.ignored_scope)) + + class OVConfig(BaseConfig): CONFIG_NAME = "openvino_config.json" FULL_CONFIGURATION_FILE = "openvino_config.json" def __init__( self, - compression: Union[List[Dict], Dict, None] = None, input_info: Optional[List] = None, save_onnx_model: bool = False, - quantization_config: Optional[Union[QuantizationConfigMixin, Dict]] = None, + quantization_config: Optional[Union[dict, OVQuantizationConfigBase]] = None, dtype: Optional[str] = None, **kwargs, ): super().__init__() - self.compression = compression self.input_info = input_info self.save_onnx_model = save_onnx_model - self._enable_standard_onnx_export_option() self.optimum_version = kwargs.pop("optimum_version", None) - self.quantization_config = quantization_config or {} + if isinstance(quantization_config, dict): + quantization_config = self._quantization_config_from_dict(quantization_config) + self.quantization_config = quantization_config + self.compression = None # A field for backward-compatability of training-time compression parameters - if isinstance(quantization_config, QuantizationConfigMixin): - bits = self.quantization_config.bits - else: - bits = self.quantization_config.get("bits", None) + bits = ( + self.quantization_config.bits if isinstance(self.quantization_config, OVWeightQuantizationConfig) else None + ) self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): @@ -137,41 +142,64 @@ def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): for name, value in model_inputs.items() ] - def save_pretrained(self, *args, **kwargs): - super().save_pretrained(*args, **kwargs) - - def _enable_standard_onnx_export_option(self): - # This method depends on self.save_onnx_model. - # save_onnx_model is defaulted to false so that the final model output is - # in OpenVINO IR to realize performance benefit in OpenVINO runtime. - # True value of save_onnx_model will save a model in onnx format. 
- if ( - isinstance(self.compression, dict) - and "algorithm" in self.compression - and self.compression["algorithm"] == "quantization" - ): - self.compression["export_to_onnx_standard_ops"] = self.save_onnx_model - elif isinstance(self.compression, list): - for i, algo_config in enumerate(self.compression): - if algo_config["algorithm"] == "quantization": - self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model + @staticmethod + def _quantization_config_from_dict(quantization_config: dict) -> OVQuantizationConfigBase: + wq_args = inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args + q_args = inspect.getfullargspec(OVQuantizationConfig.__init__).args + config_keys = quantization_config.keys() + matches_wq_config_signature = all(arg_name in wq_args for arg_name in config_keys) + matches_q_config_signature = all(arg_name in q_args for arg_name in config_keys) + if matches_wq_config_signature == matches_q_config_signature: + weight_only = quantization_config.get("weight_only", None) + if weight_only is None: + logger.warning( + "Can't determine type of OV quantization config. Please specify explicitly whether you intend to " + "run weight-only quantization or not with `weight_only` parameter. Creating an instance of " + "OVWeightQuantizationConfig." + ) + return OVWeightQuantizationConfig.from_dict(quantization_config) + matches_wq_config_signature = weight_only + + config_type = OVWeightQuantizationConfig if matches_wq_config_signature else OVQuantizationConfig + return config_type.from_dict(quantization_config) + + def _to_dict_safe(self, to_diff_dict: bool = False) -> Dict[str, Any]: + class ConfigStub: + def to_dict(self): + return None + + def to_diff_dict(self): + return None + + if self.quantization_config is None: + # Parent to_dict() implementation does not support quantization_config being None + self_copy = copy.deepcopy(self) + self_copy.quantization_config = ConfigStub() + result = self_copy.to_diff_dict() if to_diff_dict else self_copy.to_dict() + else: + result = super().to_diff_dict() if to_diff_dict else super().to_dict() + return result + + def to_dict(self) -> Dict[str, Any]: + return self._to_dict_safe(to_diff_dict=False) + + def to_diff_dict(self) -> Dict[str, Any]: + return self._to_dict_safe(to_diff_dict=True) @dataclass -class OVWeightQuantizationConfig(QuantizationConfigMixin): +class OVWeightQuantizationConfig(OVQuantizationConfigBase): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been - loaded using `optimum-intel` api for quantization with NNCF. - + loaded using `optimum-intel` api for weight-only quantization with NNCF. For full model quantization please see + OVQuantizationConfig. Args: - bits (`int`, defaults to 8): The number of bits to quantize to. sym (`bool`, defaults to `False`): - Whether to use symetric quantization. - tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): + Whether to use symmetric quantization. + tokenizer (`str`, *optional*): The tokenizer used to process the dataset. You can pass either: - - A custom tokenizer object. - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. 
@@ -179,30 +207,37 @@ class OVWeightQuantizationConfig(QuantizationConfigMixin): using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. dataset (`str or List[str]`, *optional*): The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset - in a list of strings or just use the one from the list ['wikitext2','c4','c4-new','ptb','ptb-new'] for LLLMs + in a list of strings or just use the one from the list ['wikitext','c4','c4-new','ptb','ptb-new'] for LLMs or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. + Alternatively, you can provide data objects via the `calibration_dataset` argument + of the `OVQuantizer.quantize()` method. ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). group_size (`int`, *optional*): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. all_layers (`bool`, *optional*): - Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit presicion. + Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. sensitivity_metric (`str`, *optional*): The sensitivity metric for assigning quantization precision to layers. In order to preserve the accuracy of the model, the more sensitive layers receives a higher precision. ignored_scope (`dict`, *optional*): - An ignored scope that defined the list of model control flow graph nodes to be ignored during quantization. + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. num_samples (`int`, *optional*): The maximum number of samples composing the calibration dataset. - + quant_method (`str`, defaults to OVQuantizationMethod.DEFAULT): + Weight compression method to apply. + weight_only (`bool`, *optional*): + Used to explicitly specify the type of quantization (weight-only or full) to apply. Useful when building + the config from a dictionary. """ def __init__( self, bits: int = 8, sym: bool = False, - tokenizer: Optional[Any] = None, + tokenizer: Optional[str] = None, dataset: Optional[Union[str, List[str]]] = None, ratio: float = 1.0, group_size: Optional[int] = None, @@ -210,8 +245,16 @@ def __init__( sensitivity_metric: Optional[str] = None, ignored_scope: Optional[dict] = None, num_samples: Optional[int] = None, + quant_method: Union[QuantizationMethod, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, + weight_only: Optional[bool] = True, **kwargs, ): + if weight_only is False: + logger.warning( + "Trying to create an instance of `OVWeightQuantizationConfig` with `weight_only` being " + "False. Please check your configuration."
+ ) + super().__init__(ignored_scope, num_samples, True) self.bits = bits self.sym = sym self.tokenizer = tokenizer @@ -220,21 +263,25 @@ def __init__( self.ratio = ratio self.all_layers = all_layers self.sensitivity_metric = sensitivity_metric - self.ignored_scope = ignored_scope - self.num_samples = num_samples - self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release + self.quant_method = quant_method self.post_init() def post_init(self): r""" Safety checker that arguments are correct """ + super().post_init() if self.ratio is not None and not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") + if not (self.dataset is None or isinstance(self.dataset, (str, list))): + raise ValueError( + f"Dataset must be an instance of either a string or a list of strings, but found {type(self.dataset)}. " + f"If you wish to provide a custom dataset, please pass it via the `calibration_dataset` argument." + ) if self.dataset is not None and isinstance(self.dataset, str): - llm_datasets = ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"] + llm_datasets = ["wikitext", "c4", "c4-new", "ptb", "ptb-new"] stable_diffusion_datasets = [ "conceptual_captions", "laion/220k-GPT4Vision-captions-from-LIVIS", @@ -259,6 +306,57 @@ def post_init(self): f"For 8-bit quantization, `group_size` is expected to be set to -1, but was set to {self.group_size}" ) + if self.tokenizer is not None and not isinstance(self.tokenizer, str): + raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}") + + +@dataclass +class OVQuantizationConfig(OVQuantizationConfigBase): + def __init__( + self, + sym: bool = False, + ignored_scope: Optional[dict] = None, + num_samples: Optional[int] = 300, + model_type: str = "transformer", + fast_bias_correction: bool = True, + overflow_fix: str = "disable", + weight_only: Optional[bool] = False, + **kwargs, + ): + """ + Configuration class containing parameters related to model quantization with NNCF. Compared to weight + compression, during quantization both weights and activations are converted to lower precision. + For weight-only model quantization, please see OVWeightQuantizationConfig. + Args: + sym (`bool`, defaults to `False`): + Whether to use symmetric quantization on the activations. Symmetric quantization will be applied on the weights in any case. + ignored_scope (`dict`, *optional*): + An ignored scope that defines the list of model nodes to be ignored during quantization. Dictionary + entries provided via this argument are used to create an instance of `nncf.IgnoredScope` class. + num_samples (`int`, *optional*): + The maximum number of samples composing the calibration dataset. + model_type (`str`, defaults to "transformer"): + Model type is needed to specify additional patterns in the model. Only `transformer` is supported for now. + fast_bias_correction (`bool`, defaults to True): + Whether to apply the fast or the full bias correction algorithm. + overflow_fix (`str`, defaults to "disable"): + Parameter for controlling the overflow fix setting. + weight_only (`bool`, *optional*): + Used to explicitly specify the type of quantization (weight-only or full) to apply. Useful when building + the config from a dictionary. + """ + if weight_only is True: + logger.warning( + "Trying to create an instance of `OVQuantizationConfig` with `weight_only` being True. " + "Please check your configuration."
+ ) + super().__init__(ignored_scope, num_samples, False) + self.sym = sym + self.model_type = model_type + self.fast_bias_correction = fast_bias_correction + self.overflow_fix = overflow_fix + self.post_init() + def _check_default_4bit_configs(config: PretrainedConfig): return _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 8a816609fa..9c7c2b5258 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -43,6 +43,7 @@ CausalLMOutput, ImageClassifierOutput, MaskedLMOutput, + ModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, @@ -953,3 +954,66 @@ def forward( logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] return TokenClassifierOutput(logits=logits) + + +CUSTOM_TASKS_EXAMPLE = """ + Example of custom tasks (e.g. a sentence transformers with a pooler head): + + ```python + >>> from transformers import {processor_class} + >>> from optimum.intel import {model_class} + + >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}") + + >>> inputs = tokenizer("I love burritos!", return_tensors="np") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooler_output = outputs.pooler_output + ``` +""" + + +@add_start_docstrings( + """ + OpenVINO Model for custom tasks. It can be used to leverage the inference acceleration for any single-file OpenVINO model, that may use custom inputs and outputs. + """, + MODEL_START_DOCSTRING, +) +class OVModelForCustomTasks(OVModel): + @add_start_docstrings_to_model_forward( + CUSTOM_TASKS_EXAMPLE.format( + processor_class=_TOKENIZER_FOR_DOC, + model_class="OVModelForCustomTasks", + checkpoint="IlyasMoutawwakil/sbert-all-MiniLM-L6-v2-with-pooler", + ) + ) + def forward(self, **kwargs): + expected_inputs_names = set(self.input_names) + inputs_names = set(kwargs) + + if not expected_inputs_names.issubset(inputs_names): + raise ValueError( + f"Got unexpected inputs: expecting the following inputs : {', '.join(expected_inputs_names)} but got : {', '.join(inputs_names)}." + ) + + np_inputs = isinstance(next(iter(kwargs.values())), np.ndarray) + inputs = {} + for input_name in self.input_names: + inputs[input_name] = np.array(kwargs.pop(input_name)) if not np_inputs else kwargs.pop(input_name) + + outputs = self.request(inputs) + + model_outputs = {} + for key, value in outputs.items(): + key_name = next(iter(key.names)) + if "." 
in key_name: + key_name = key_name.split(".")[0] + if key_name not in model_outputs: + model_outputs[key_name] = [] + model_outputs[key_name].append(torch.from_numpy(value).to(self.device) if not np_inputs else value) + else: + model_outputs[key_name] = torch.from_numpy(value).to(self.device) if not np_inputs else value + + return ModelOutput(**model_outputs) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index a6b8aacf43..d5b19bb28c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -100,13 +100,21 @@ def __init__( self._openvino_config = OVConfig(quantization_config=quantization_config) @staticmethod - def load_model(file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None): + def load_model( + file_name: Union[str, Path], + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + calibration_dataset: Optional = None, + ): """ Loads the model. Arguments: file_name (`str` or `Path`): The path of the model ONNX or XML file. + quantization_config (`OVWeightQuantizationConfig` or `Dict`, *optional*): + Quantization config to apply after model is loaded. + calibration_dataset (`nncf.Dataset`, *optional*): + Optional nncf.Dataset to feed to model weight compression when quantization config is provided. """ def fix_op_names_duplicates(model: openvino.runtime.Model): @@ -135,7 +143,7 @@ def fix_op_names_duplicates(model: openvino.runtime.Model): from optimum.intel.openvino.quantization import _weight_only_quantization - model = _weight_only_quantization(model, quantization_config) + model = _weight_only_quantization(model, quantization_config, calibration_dataset=calibration_dataset) return model diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 10f0359a24..44137186e2 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -120,6 +120,7 @@ def __init__( self._original_model = self.model.clone() # keep original model for serialization self._pkv_precision = Type.f32 self.next_beam_idx = None + self._past_length = 0 self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -356,19 +357,14 @@ def prepare_inputs( position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> Dict: - if self.use_cache and past_key_values is not None: - input_ids = input_ids[:, -1:] - batch_size = input_ids.shape[0] if self.config.model_type == "bloom": batch_size *= self.config.num_attention_heads inputs = {} - past_len = 0 if not self.stateful: if past_key_values is not None: if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: - past_len = past_key_values[0][1].shape[-2] if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -381,8 +377,6 @@ def prepare_inputs( past_key_values = tuple( past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer ) - else: - past_len = past_key_values[0].shape[-2] # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) @@ -411,6 +405,8 @@ def prepare_inputs( # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int) + self._past_length = 0 + 
past_len = self._get_past_length(past_key_values) inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed @@ -432,7 +428,7 @@ def prepare_inputs( position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 if past_key_values: - position_ids = np.expand_dims(position_ids[:, -1], axis=-1) + position_ids = position_ids[:, -input_ids.shape[1] :] inputs["position_ids"] = position_ids @@ -470,6 +466,7 @@ def forward( # the first condition at the function beginning above. # It should be something that is not None and it should be True when converted to Boolean. past_key_values = ((),) + self._past_length += input_ids.shape[1] if not self.stateful: if self.use_cache: @@ -485,19 +482,32 @@ def forward( return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation + # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) + if past_key_values is not None: + past_len = self._get_past_length(past_key_values) + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. 
+ elif past_len < input_ids.shape[1]: + input_ids = input_ids[:, past_len:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: + if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) + position_ids = position_ids[:, -input_ids.shape[1] :] return { "input_ids": input_ids, @@ -507,6 +517,24 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "attention_mask": attention_mask, } + def _get_past_length(self, past_key_values=None): + if past_key_values is None: + return 0 + if self.stateful: + return self._past_length + if self.config.model_type in MULTI_QUERY_ATTN_MODELS: + return past_key_values[0].shape[-2] + seq_length_dim = -2 + if self.config.model_type == "chatglm": + seq_length_dim = 0 + elif self.config.model_type == "qwen": + seq_length_dim = 1 + # input is tuple of pairs + if isinstance(past_key_values[0], (tuple, list)): + return past_key_values[0][1].shape[seq_length_dim] + # past key values comes after flattening + return past_key_values[1].shape[seq_length_dim] + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor @@ -544,7 +572,7 @@ def _from_pretrained( from_onnx: bool = False, local_files_only: bool = False, load_in_8bit: bool = False, - quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None, **kwargs, ): model_path = Path(model_id) @@ -568,15 +596,16 @@ def _from_pretrained( quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) load_in_4bit = quantization_config.bits == 4 if quantization_config else False - model = cls.load_model(model_cache_path, quantization_config=None if load_in_4bit else quantization_config) + calibration_dataset = kwargs.get("calibration_dataset", None) + model = cls.load_model( + model_cache_path, + quantization_config=None if load_in_4bit else quantization_config, + calibration_dataset=calibration_dataset, + ) model_type = config.model_type.replace("_", "-") if model_type == "bloom": init_cls = OVBloomForCausalLM - elif model_type == "mpt": - init_cls = OVMPTForCausalLM - elif model_type == "opt": - init_cls = OVOPTForCausalLM elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM else: @@ -608,7 +637,7 @@ def _from_pretrained( f"For the given model, we recommend the following `quantization_config` : {default_config}" ) - if isinstance(quantization_config.dataset, str): + if calibration_dataset is None and isinstance(quantization_config.dataset, str): tokenizer = quantization_config.tokenizer or AutoTokenizer.from_pretrained(model_id) from optimum.gptq.data import get_dataset, prepare_dataset @@ -620,9 +649,9 @@ def _from_pretrained( dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) dataset = prepare_dataset(dataset) quantization_config = copy.deepcopy(quantization_config) - quantization_config.dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) + 
calibration_dataset = nncf.Dataset(dataset, lambda x: causal_model.prepare_inputs(**x)) - _weight_only_quantization(model, quantization_config) + _weight_only_quantization(model, quantization_config, calibration_dataset) return causal_model @@ -630,22 +659,12 @@ def _from_pretrained( class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - # only last token for input_ids if past is not None if past_key_values and not self.stateful: # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } + return super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, **kwargs) # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache def _reorder_cache( @@ -712,36 +731,6 @@ def _convert_to_standard_cache( ) -class OVOPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - -class OVMPTForCausalLM(OVModelForCausalLM): - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": None, - "attention_mask": attention_mask, - } - - class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache def _reorder_cache( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index eb407b4cd1..7bc7cca04c 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -387,7 +387,7 @@ def transform_fn(data_item): self.__call__(**inputs, height=height, width=width) else: self.__call__(*inputs, height=height, width=width) - if len(calibration_data) > num_samples: + if len(calibration_data) >= num_samples: break self.unet.request = self.unet.request.request @@ -671,7 +671,7 @@ def _compile(self): if ( "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()) - and self.device.lower().split(":")[0] == "gpu" + and "gpu" in self.device.lower() ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index a2579611a4..aae66c148b 100644 --- 
a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -18,14 +18,15 @@ import os from collections import deque from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +import datasets import nncf import openvino import torch import transformers -from nncf import CompressWeightsMode, IgnoredScope, SensitivityMetric -from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters +from nncf import CompressWeightsMode, SensitivityMetric +from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters, OverflowFix from nncf.torch import register_module from nncf.torch.initialization import PTInitializingDataLoader from openvino._offline_transformations import compress_quantize_weights_transformation @@ -46,7 +47,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, @@ -203,39 +204,52 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - calibration_dataset: "Dataset" = None, + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, save_directory: Union[str, Path] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, + weights_only: bool = None, **kwargs, ): """ Quantize a model given the optimization specifications defined in `quantization_config`. Args: - calibration_dataset (`datasets.Dataset`): - The dataset to use for the calibration step. + calibration_dataset (`datasets.Dataset` or `nncf.Dataset` or `Iterable`, *optional*): + A collection of data samples to use for quantization calibration. Is optional for weight-only + quantization and is required for full quantization. save_directory (`Union[str, Path]`): The directory where the quantized model should be saved. - quantization_config (`OVConfig`, *optional*): - The configuration containing the parameters related to quantization. + ov_config (`OVConfig`, *optional*): + The configuration containing the parameters related to quantization. If not provided, 8-bit symmetric + weight-only quantization will be applied. file_name (`str`, *optional*): The model file name to use when saving the model. Overwrites the default file name `"model.onnx"`. - batch_size (`int`, defaults to 8): + batch_size (`int`, defaults to 1): The number of calibration samples to load per batch. data_collator (`DataCollator`, *optional*): The function to use to form a batch from a list of elements of the calibration dataset. remove_unused_columns (`bool`, defaults to `True`): - Whether or not to remove the columns unused by the model forward method. - weights_only (`bool`, defaults to `False`): + Whether to remove the columns unused by the model forward method. + weights_only (`bool`, *optional*): + Being deprecated. Compress weights to integer precision (8-bit by default) while keeping activations floating-point. Fits best for LLM footprint reduction and performance acceleration. 
Examples: + ```python + >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM + >>> from transformers import AutoModelForCausalLM + >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") + >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") + >>> ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig()) + >>> quantizer.quantize(ov_config=ov_config, save_directory="./quantized_model") + >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") + ``` + ```python >>> from optimum.intel.openvino import OVQuantizer, OVModelForSequenceClassification >>> from transformers import AutoModelForSequenceClassification @@ -243,53 +257,51 @@ def quantize( >>> # or >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") >>> quantizer = OVQuantizer.from_pretrained(model, task="text-classification") - >>> quantizer.quantize(calibration_dataset=calibration_dataset, save_directory="./quantized_model") + >>> ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + >>> quantizer.quantize(calibration_dataset=dataset, ov_config=ov_config, save_directory="./quantized_model") >>> optimized_model = OVModelForSequenceClassification.from_pretrained("./quantized_model") ``` - - ```python - >>> from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM - >>> from transformers import AutoModelForCausalLM - >>> model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") - >>> quantizer = OVQuantizer.from_pretrained(model, task="text-generation") - >>> quantizer.quantize(save_directory="./quantized_model", weights_only=True) - >>> optimized_model = OVModelForCausalLM.from_pretrained("./quantized_model") - ``` """ + # TODO: deprecate weights_only argument + if weights_only is not None: + logger.warning( + "`weights_only` argument is deprecated. In the future please provide `ov_config.quantization_config` " + "as an instance of OVWeightQuantizationConfig for weight-only compression or as an instance of " + "OVQuantizationConfig for full model quantization." + ) + if save_directory is None: # TODO : can be set to self.model.config.name_or_path for OVModels when not provided raise ValueError("`save_directory` needs to be specified") - if weights_only: - if calibration_dataset is not None: + + if ov_config is None: + ov_config = OVConfig() + if not isinstance(ov_config, OVConfig): + raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + quantization_config = ov_config.quantization_config + if quantization_config is None: + if (weights_only is None or weights_only is True) and calibration_dataset is None: + if weights_only is None: + logger.info( + "`quantization_config` was not provided, 8-bit asymmetric weight quantization will be applied." + ) + ov_config.quantization_config = OVWeightQuantizationConfig(bits=8) + else: logger.warning( - "`calibration_dataset` was provided but will not be used as `weights_only` is set to `True`." + "`quantization_config` was not provided, but calibration dataset was provided, assuming full " + "model quantization is intended. In the future, please provide `quantization_config` as an " + "instance of OVQuantizationConfig." ) - else: - if calibration_dataset is None: - raise ValueError( - "`calibration_dataset` is needed to compute the activations range during the calibration step and was not provided. 
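A short migration sketch for the deprecated `weights_only` flag: with no `ov_config` and no calibration dataset, 8-bit weight-only compression is now applied by default (model id reused from the docstring above):

```python
from transformers import AutoModelForCausalLM
from optimum.intel import OVQuantizer

model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b")  # id reused from the docstring above
quantizer = OVQuantizer.from_pretrained(model, task="text-generation")

# No ov_config and no calibration dataset: 8-bit weight-only compression is applied by
# default, replacing the removed `weights_only=True` flag.
quantizer.quantize(save_directory="./quantized_model")
```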
" - "In case you only want to apply quantization on the weights, please set `weights_only=True`." - ) - quantization_config = kwargs.pop("quantization_config", None) - if quantization_config is not None: - logger.warning( - "The argument `quantization_config` is deprecated, and will be removed in optimum-intel v1.6.0, please use `ov_config` instead" - ) - ov_config = ov_config or quantization_config - - if ov_config is not None: - if not isinstance(ov_config, OVConfig): - raise TypeError(f"`ov_config` should be an `OVConfig`, but got: {type(ov_config)} instead.") + ov_config.quantization_config = OVQuantizationConfig() if isinstance(self.model, OVBaseModel): self._quantize_ovbasemodel( - calibration_dataset, + ov_config, save_directory, + calibration_dataset, batch_size, data_collator, remove_unused_columns, - weights_only, - ov_config, **kwargs, ) @@ -299,84 +311,99 @@ def quantize( "To convert a PyTorch model to OpenVINO, you can set `export=True` when loading your model as `OVModelForXxx.from_pretrained(..., export=True)`" ) self._quantize_torchmodel( - calibration_dataset, + ov_config, save_directory, + calibration_dataset, file_name, batch_size, data_collator, remove_unused_columns, - weights_only, + **kwargs, ) else: raise TypeError(f"Unsupported model type: {type(self.model)}") def _quantize_ovbasemodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - ov_config: OVConfig = None, **kwargs, ): save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - if weights_only: - q_config = getattr(ov_config, "quantization_config", None) - # Use default 8-bit compression if not provided - q_config = q_config or OVWeightQuantizationConfig(bits=8, sym=True) - _weight_only_quantization(self.model.model, q_config) - + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) return + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + elif isinstance(calibration_dataset, datasets.Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) - if self.model.export_feature == "text-generation" and self.model.use_cache: - # Prefeth past_key_values - self.model.update_pkv_precision(True) - self.model.compile() - subset_size = kwargs.get("subset_size", 300) - collected_inputs = [] - - self.model.request = InferRequestWrapper(self.model.request, collected_inputs) - for _, data in enumerate(calibration_dataloader): - self.model.generate(**data, max_new_tokens=1) - if len(collected_inputs) >= subset_size: - break - 
self.model.request = self.model.request.request - calibration_dataloader = collected_inputs + if self.model.export_feature == "text-generation" and self.model.use_cache: + # Prefetch past_key_values + self.model.update_pkv_precision(True) + self.model.compile() + collected_inputs = [] + + self.model.request = InferRequestWrapper(self.model.request, collected_inputs) + try: + for data in calibration_dataloader: + self.model.generate(**data, max_new_tokens=1) + if len(collected_inputs) >= quantization_config.num_samples: + break + finally: + self.model.request = self.model.request.request + quantization_dataset = nncf.Dataset(collected_inputs) + else: + quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) # Actual model quantization - quantization_dataset = nncf.Dataset(calibration_dataloader) quantized_model = nncf.quantize( self.model.model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), + subset_size=quantization_config.num_samples, + ignored_scope=quantization_config.get_ignored_scope_instance(), + model_type=nncf.ModelType(quantization_config.model_type), + preset=nncf.QuantizationPreset.PERFORMANCE if quantization_config.sym else nncf.QuantizationPreset.MIXED, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters( + overflow_fix=OverflowFix(quantization_config.overflow_fix) + ), **kwargs, ) self.model.model = quantized_model self.model.save_pretrained(save_directory) + ov_config.save_pretrained(save_directory) def _quantize_torchmodel( self, - calibration_dataset: "Dataset", + ov_config: OVConfig, save_directory: Union[str, Path], + calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, - weights_only: bool = False, - save_onnx_model: bool = False, **kwargs, ): self._set_task() @@ -394,6 +421,7 @@ def _quantize_torchmodel( model_type=model_type, ) + save_onnx_model = ov_config.save_onnx_model onnx_file_name = ( ONNX_WEIGHTS_NAME if file_name is None and save_onnx_model else Path(ov_file_name).with_suffix(".onnx") ) @@ -412,7 +440,8 @@ def _quantize_torchmodel( stateful = ensure_stateful_is_available() and ensure_export_task_support_stateful(task) - if weights_only: + quantization_config = ov_config.quantization_config + if isinstance(quantization_config, OVWeightQuantizationConfig): if stateful: # patch model before weight compression model = patch_model_with_bettertransformer(model) @@ -426,6 +455,8 @@ def _quantize_torchmodel( nncf.compress_weights(model, dataset=nncf.Dataset([dummy_inputs])) else: + if not isinstance(quantization_config, OVQuantizationConfig): + raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") if stateful: logger.warn( "Quantization algorithm does not support optimized stateful models. 
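A sketch of full static quantization through the new config-driven path, assuming `OVQuantizationConfig` exposes `num_samples` and `sym` as constructor arguments (they mirror the attributes read in the hunk above); the model id and calibration dataset are illustrative:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVConfig, OVQuantizationConfig, OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # id reused from the docstring above
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
quantizer = OVQuantizer.from_pretrained(model, task="text-classification")

# A datasets.Dataset built with the quantizer's helper goes through the dataloader branch above.
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=lambda ex: tokenizer(ex["sentence"], padding="max_length", max_length=128, truncation=True),
    num_samples=100,
    dataset_split="train",
)

# num_samples and sym feed nncf.quantize's subset_size and preset as wired up above.
ov_config = OVConfig(quantization_config=OVQuantizationConfig(num_samples=100, sym=False))
quantizer.quantize(calibration_dataset=calibration_dataset, ov_config=ov_config, save_directory="./quantized_model")
```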
" @@ -433,19 +464,33 @@ def _quantize_torchmodel( ) stateful = False - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - - quantization_dataset = nncf.Dataset(calibration_dataloader) + if isinstance(calibration_dataset, nncf.Dataset): + quantization_dataset = calibration_dataset + elif isinstance(calibration_dataset, datasets.Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + quantization_dataset = nncf.Dataset(calibration_dataloader) + else: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") + quantization_dataset = nncf.Dataset(calibration_dataset) model = nncf.quantize( model, quantization_dataset, - model_type=nncf.ModelType.TRANSFORMER if not kwargs.get("model_type") else kwargs.get("model_type"), - fast_bias_correction=kwargs.get("fast_bias_correction", True), + subset_size=quantization_config.num_samples, + ignored_scope=quantization_config.get_ignored_scope_instance(), + model_type=nncf.ModelType(quantization_config.model_type), + preset=nncf.QuantizationPreset.PERFORMANCE + if quantization_config.sym + else nncf.QuantizationPreset.MIXED, + fast_bias_correction=quantization_config.fast_bias_correction, + advanced_parameters=nncf.AdvancedQuantizationParameters( + overflow_fix=OverflowFix(quantization_config.overflow_fix) + ), **kwargs, ) @@ -472,6 +517,8 @@ def _quantize_torchmodel( except FileNotFoundError: pass + ov_config.save_pretrained(save_directory) + @staticmethod def _save_pretrained(model: openvino.runtime.Model, output_path: str): compress_quantize_weights_transformation(model) @@ -503,7 +550,7 @@ def get_calibration_dataset( preprocess_batch: bool = True, use_auth_token: bool = False, cache_dir: Optional[str] = None, - ) -> "Dataset": + ) -> datasets.Dataset: """ Create the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -580,18 +627,33 @@ def _remove_unused_columns(self, dataset: "Dataset"): def _weight_only_quantization( - model: openvino.runtime.Model, quantization_config: Union[OVWeightQuantizationConfig, Dict] + model: openvino.runtime.Model, + quantization_config: Union[OVWeightQuantizationConfig, Dict], + calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None, ) -> openvino.runtime.Model: config = quantization_config if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) - dataset = config.dataset - - if config.dataset is not None and isinstance(config.dataset, str): - tokenizer = config.tokenizer - if isinstance(tokenizer, str): - tokenizer = AutoTokenizer.from_pretrained(tokenizer) + if config.dataset is not None and calibration_dataset is not None: + logger.info( + "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " + "quantization. Will rely on `calibration_dataset`." + ) + dataset = None + if calibration_dataset is not None: + if isinstance(calibration_dataset, datasets.Dataset): + raise ValueError( + "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " + "quantization is not supported. Please provide it as `nncf.Dataset` or as iterable of " + "model inputs." 
+ ) + elif isinstance(calibration_dataset, nncf.Dataset): + dataset = calibration_dataset + else: + dataset = nncf.Dataset(calibration_dataset) + elif config.dataset is not None and isinstance(config.dataset, str): + tokenizer = AutoTokenizer.from_pretrained(config.tokenizer) from optimum.gptq.data import get_dataset, prepare_dataset @@ -603,10 +665,6 @@ def _weight_only_quantization( if isinstance(config.sensitivity_metric, str): sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - ignored_scope = None - if isinstance(config.ignored_scope, dict): - ignored_scope = IgnoredScope(**config.ignored_scope) - if config.bits == 8: mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM else: @@ -619,10 +677,10 @@ def _weight_only_quantization( group_size=config.group_size, all_layers=config.all_layers, sensitivity_metric=sensitivity_metric, - # awq=config.quant_method == "awq", # TODO : remove and add it back once nncf v2.9.0 - ignored_scope=ignored_scope, + # awq=config.quant_method == QuantizationMethod.AWQ, # TODO : enable from nncf v2.9.0 + ignored_scope=config.get_ignored_scope_instance(), dataset=dataset, - # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 + # subset_size=config.num_samples if config.num_samples else 128, # TODO : enable from nncf v2.9.0 ) @@ -691,23 +749,23 @@ def _hybrid_quantization( """ ops_to_compress = _collect_ops_with_weights(model) - ignored_scope = quantization_config.ignored_scope if isinstance(quantization_config.ignored_scope, dict) else {} - ptq_ignored_scope = nncf.IgnoredScope(**ignored_scope) - ptq_ignored_scope.names += ops_to_compress - - wc_quantization_config = copy.deepcopy(quantization_config) - wc_quantization_config.ignored_scope = ignored_scope - wc_quantization_config.ignored_scope["types"] = ignored_scope.get("types", []) + ["Convolution"] - compressed_model = _weight_only_quantization(model, wc_quantization_config) + wc_config = copy.deepcopy(quantization_config) + wc_config.ignored_scope = wc_config.ignored_scope or {} + wc_config.ignored_scope["types"] = wc_config.ignored_scope.get("types", []) + ["Convolution"] + compressed_model = _weight_only_quantization(model, wc_config) + ptq_ignored_scope = quantization_config.get_ignored_scope_instance() + ptq_ignored_scope.names += ops_to_compress subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, calibration_dataset=nncf.Dataset(dataset), model_type=nncf.ModelType.TRANSFORMER, ignored_scope=ptq_ignored_scope, - # The SQ algo should be disabled for MatMul nodes because their weights are already compressed - advanced_parameters=nncf.AdvancedQuantizationParameters(AdvancedSmoothQuantParameters(matmul=-1)), + # SQ algo should be disabled for MatMul nodes because their weights are already compressed + advanced_parameters=nncf.AdvancedQuantizationParameters( + smooth_quant_alphas=AdvancedSmoothQuantParameters(matmul=-1) + ), subset_size=subset_size, ) return quantized_model diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index b7d110c96a..873b0909c8 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -80,7 +80,6 @@ is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, - is_torch_tpu_available, logging, ) @@ -89,7 +88,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import 
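A sketch of weight-only compression of an already-exported OpenVINO model with a caller-provided `nncf.Dataset`, matching the `calibration_dataset` handling added to `_weight_only_quantization` above; the model id, sample texts and compression parameters are illustrative:

```python
import nncf
from transformers import AutoTokenizer
from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

model_id = "gpt2"  # illustrative model id
model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_8bit=False)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibration data for weight-only compression must be an nncf.Dataset or a plain iterable of
# model inputs; a datasets.Dataset is rejected by the check added above.
texts = ["The quick brown fox", "jumps over the lazy dog"]
calibration_dataset = nncf.Dataset([model.prepare_inputs(**tokenizer(t, return_tensors="pt")) for t in texts])

quantizer = OVQuantizer.from_pretrained(model)
ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=64))
quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory="./int4_model")
```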
is_transformers_version -from .configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig +from .configuration import OVConfig from .quantization import OVDataLoader from .training_args import OVTrainingArguments from .utils import ( @@ -101,6 +100,11 @@ ) +if is_transformers_version(">=", "4.39.0"): + from transformers.utils import is_torch_xla_available +else: + from transformers.utils import is_torch_tpu_available as is_torch_xla_available + if is_accelerate_available(): from accelerate import __version__ as accelerate_version from accelerate import skip_first_batches @@ -123,7 +127,7 @@ if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp -if is_torch_tpu_available(check_device=False): +if is_torch_xla_available(): import torch_xla.core.xla_model as xm core = Core() @@ -136,6 +140,25 @@ NNCF_LOG_FILE_NAME = "nncf_output.log" +DEFAULT_QUANTIZATION_CONFIG = { + "algorithm": "quantization", + "preset": "mixed", + "overflow_fix": "disable", + "initializer": { + "range": {"num_init_samples": 300, "type": "mean_min_max"}, + "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, + }, + "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, + "ignored_scopes": [ + "{re}.*Embedding.*", + "{re}.*add___.*", + "{re}.*layer_norm_.*", + "{re}.*matmul_1", + "{re}.*__truediv__.*", + ], +} + + def _onnx_export_nncf_model(model: NNCFNetwork, config: OnnxConfig, output: Union[str, io.BytesIO], opset: int = None): # TODO: remove it when fix controller.strip(copy=True) behavior signature = inspect.signature(model.forward) @@ -228,6 +251,16 @@ def __init__( if self.ov_config is not None: if self.ov_config.compression is None: self.ov_config.compression = DEFAULT_QUANTIZATION_CONFIG + if ( + isinstance(self.ov_config.compression, dict) + and "algorithm" in self.ov_config.compression + and self.ov_config.compression["algorithm"] == "quantization" + ): + self.ov_config.compression["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model + elif isinstance(self.ov_config.compression, list): + for i, algo_config in enumerate(self.ov_config.compression): + if algo_config["algorithm"] == "quantization": + self.ov_config.compression[i]["export_to_onnx_standard_ops"] = self.ov_config.save_onnx_model if self.args.do_train: self._set_task() @@ -611,7 +644,7 @@ def _inner_training_loop( if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + and not is_torch_xla_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses @@ -706,7 +739,7 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: # Wait for everyone to get here so we are sure the model has been saved by process 0. 
- if is_torch_tpu_available(): + if is_torch_xla_available(): xm.rendezvous("load_best_model_at_end") elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() @@ -799,7 +832,7 @@ def compute_loss(self, model, inputs, return_outputs=False): def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): if self.control.should_log: - if is_torch_tpu_available(): + if is_torch_xla_available(): xm.mark_step() logs: Dict[str, float] = {} diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index a0439d2129..4d1479f733 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -96,6 +96,7 @@ "stable-diffusion": "OVStableDiffusionPipeline", "stable-diffusion-xl": "OVStableDiffusionXLPipeline", "pix2struct": "OVModelForPix2Struct", + "latent-consistency": "OVLatentConsistencyModelPipeline", } diff --git a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py index 8ae3135667..e646074e1e 100644 --- a/optimum/intel/utils/dummy_openvino_and_nncf_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_nncf_objects.py @@ -46,3 +46,25 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "nncf"]) + + +class OVWeightQuantizationConfig(metaclass=DummyObject): + _backends = ["openvino", "nncf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "nncf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "nncf"]) + + +class OVQuantizationConfig(metaclass=DummyObject): + _backends = ["openvino", "nncf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "nncf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "nncf"]) diff --git a/optimum/intel/utils/dummy_openvino_objects.py b/optimum/intel/utils/dummy_openvino_objects.py index d5e42851da..5af3222d86 100644 --- a/optimum/intel/utils/dummy_openvino_objects.py +++ b/optimum/intel/utils/dummy_openvino_objects.py @@ -189,14 +189,3 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino"]) - - -class OVWeightQuantizationConfig(metaclass=DummyObject): - _backends = ["openvino"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["openvino"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["openvino"]) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 08a9ec1f88..fcdf932a28 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import functools import importlib.util import logging import operator as op @@ -95,32 +95,6 @@ except ImportError: _openvino_available = False -_openvino_tokenizers_available = importlib.util.find_spec("openvino_tokenizers") is not None and _openvino_available -_openvino_tokenizers_version = "N/A" -if _openvino_tokenizers_available: - try: - _openvino_tokenizers_version = importlib_metadata.version("openvino_tokenizers") - except importlib_metadata.PackageNotFoundError: - _openvino_tokenizers_available = False - -if _openvino_tokenizers_available and _openvino_tokenizers_version != "N/A": - _compatible_openvino_version = next( - ( - requirement.split("==")[-1] - for requirement in importlib_metadata.requires("openvino-tokenizers") - if requirement.startswith("openvino==") - ), - "", - ) - _openvino_tokenizers_available = _compatible_openvino_version == ov_major_version - if not _openvino_tokenizers_available: - logger.warning( - "OpenVINO Tokenizer version is not compatible with OpenVINO version. " - f"Installed OpenVINO version: {ov_major_version}," - f"OpenVINO Tokenizers requires {_compatible_openvino_version}. " - f"OpenVINO Tokenizers models will not be added during export." - ) - _nncf_available = importlib.util.find_spec("nncf") is not None _nncf_version = "N/A" if _nncf_available: @@ -196,8 +170,81 @@ def is_openvino_available(): return _openvino_available +@functools.lru_cache(1) def is_openvino_tokenizers_available(): - return _openvino_tokenizers_available + if not is_openvino_available(): + return False + + if importlib.util.find_spec("openvino_tokenizers") is None: + logger.info( + "OpenVINO Tokenizers is not available. To deploy models in production " + "with C++ code, please follow installation instructions: " + "https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#installation\n" + ) + return False + + try: + pip_metadata_version = importlib_metadata.version("openvino") + except importlib_metadata.PackageNotFoundError: + pip_metadata_version = False + try: + pip_metadata_version = importlib_metadata.version("openvino-nightly") + is_nightly = True + except importlib_metadata.PackageNotFoundError: + is_nightly = False + + try: + import openvino_tokenizers + + openvino_tokenizers._get_factory() + except RuntimeError: + tokenizers_version = openvino_tokenizers.__version__ + + if tokenizers_version == "0.0.0.0": + try: + tokenizers_version = importlib_metadata.version("openvino_tokenizers") or tokenizers_version + except importlib_metadata.PackageNotFoundError: + pass + message = ( + "OpenVINO and OpenVINO Tokenizers versions are not binary compatible.\n" + f"OpenVINO version: {_openvino_version}\n" + f"OpenVINO Tokenizers version: {tokenizers_version}\n" + "First 3 numbers should be the same. Update OpenVINO Tokenizers to compatible version. " + ) + if not pip_metadata_version: + message += ( + "For archive installation of OpenVINO try to build OpenVINO Tokenizers from source: " + "https://github.com/openvinotoolkit/openvino_tokenizers/tree/master?tab=readme-ov-file" + "#build-and-install-from-source" + ) + if sys.platform == "linux": + message += ( + "\nThe PyPI version of OpenVINO Tokenizers is built on CentOS and may not be compatible with other " + "Linux distributions; rebuild OpenVINO Tokenizers from source." + ) + else: + message += ( + "It is recommended to use the same day builds for pre-release version. 
" + "To install both OpenVINO and OpenVINO Tokenizers release version perform:\n" + ) + if is_nightly: + message += "pip uninstall -y openvino-nightly && " + message += "pip install --force-reinstall openvino openvino-tokenizers\n" + if is_nightly: + message += ( + "openvino-nightly package will be deprecated in the future - use pre-release drops instead. " + ) + message += "To update both OpenVINO and OpenVINO Tokenizers to the latest pre-release version perform:\n" + if is_nightly: + message += "pip uninstall -y openvino-nightly && " + message += ( + "pip install --pre -U openvino openvino-tokenizers " + "--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly" + ) + logger.warning(message) + return False + + return True def is_nncf_available(): diff --git a/setup.py b/setup.py index e80d0ea448..ea87e6ad59 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", "transformers>=4.36.0,<4.40.0", - "optimum~=1.18", + "optimum~=1.19", "datasets>=1.4.0", "sentencepiece", "scipy", @@ -52,18 +52,15 @@ "auto-gptq", "transformers_stream_generator", "einops", + "tiktoken", + "sentence_transformers", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] EXTRAS_REQUIRE = { - "neural-compressor": [ - "neural-compressor>=2.2.0", - "onnxruntime<1.15.0", - "accelerate", - ], - "openvino": ["openvino>=2023.3", "nncf>=2.8.1"], - "openvino-tokenizers": ["openvino-tokenizers[transformers]"], + "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], + "openvino": ["openvino>=2023.3", "nncf>=2.8.1", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.8.1"], "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"], "diffusers": ["diffusers"], diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index bd23d4d093..94a5ca9e16 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -103,6 +103,7 @@ def __exit__(self, type, value, traceback): class IPEXModelTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModel SUPPORTED_ARCHITECTURES = ( "albert", "bert", @@ -115,8 +116,6 @@ class IPEXModelTest(unittest.TestCase): "xlm", ) - IPEX_MODEL_CLASS = IPEXModel - @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -150,11 +149,11 @@ def test_pipeline(self, model_arch): class IPEXModelForSequenceClassificationTest(IPEXModelTest): - IPEX_MODEL_CLASS = IPEXModelForTokenClassification + IPEX_MODEL_CLASS = IPEXModelForSequenceClassification class IPEXModelForTokenClassificationTest(IPEXModelTest): - IPEX_MODEL_CLASS = IPEXModelForSequenceClassification + IPEX_MODEL_CLASS = IPEXModelForTokenClassification class IPEXModelForMaskedLMTest(IPEXModelTest): @@ -162,6 +161,7 @@ class IPEXModelForMaskedLMTest(IPEXModelTest): class IPEXModelForQuestionAnsweringTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModelForQuestionAnswering SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", @@ -202,6 +202,7 @@ def test_pipeline(self, model_arch): class IPEXModelForCausalLMTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModelForCausalLM SUPPORTED_ARCHITECTURES = ( "bart", "gpt_bigcode", @@ -252,7 +253,7 @@ def test_compare_to_transformers(self, model_arch): def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) - model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False) + model = 
IPEXModelForCausalLM.from_pretrained(model_id, export=True) model.config.encoder_no_repeat_ngram_size = 0 model.to("cpu") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) @@ -382,6 +383,7 @@ def test_pipeline(self, model_arch): class IPEXModelForImageClassificationIntegrationTest(unittest.TestCase): + IPEX_MODEL_CLASS = IPEXModelForImageClassification SUPPORTED_ARCHITECTURES = ( "beit", # "levit", @@ -391,7 +393,6 @@ class IPEXModelForImageClassificationIntegrationTest(unittest.TestCase): "resnet", "vit", ) - IPEX_MODEL_CLASS = IPEXModelForImageClassification @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): diff --git a/tests/neural_compressor/test_ipex.py b/tests/neural_compressor/test_ipex.py new file mode 100644 index 0000000000..ef16dbed19 --- /dev/null +++ b/tests/neural_compressor/test_ipex.py @@ -0,0 +1,86 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ruff: noqa + + +import os +import tempfile + +from neural_compressor.config import PostTrainingQuantConfig + +from parameterized import parameterized +from transformers import ( + AutoModelForCausalLM, + AutoModelForQuestionAnswering, + AutoTokenizer, + set_seed, +) +from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset + + +from optimum.intel import ( + INCConfig, + INCModelForCausalLM, + INCModelForSeq2SeqLM, + INCModelForQuestionAnswering, + INCModelForSequenceClassification, + INCModelForMaskedLM, + INCModelForTokenClassification, + INCQuantizer, + INCSeq2SeqTrainer, +) +from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification +from optimum.pipelines import ORT_SUPPORTED_TASKS + + +os.environ["CUDA_VISIBLE_DEVICES"] = "" +set_seed(SEED) + + +class IPEXQuantizationTest(INCTestMixin): + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("text-classification", "bert", 21),) + + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + def test_ipex_static_quantization_with_smoothquant(self, task, model_arch, expected_quantized_matmuls): + recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}} + num_samples = 10 + model_name = MODEL_NAMES[model_arch] + quantization_config = PostTrainingQuantConfig(approach="static", backend="ipex", recipes=recipes) + model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + quantizer = INCQuantizer.from_pretrained(model, task=task) + calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples) + + with tempfile.TemporaryDirectory() as tmp_dir: + quantizer.quantize( + quantization_config=quantization_config, + calibration_dataset=calibration_dataset, + save_directory=tmp_dir, + save_onnx_model=False, + ) + self.check_model_outputs( + q_model=quantizer._quantized_model, + task=task, 
+ tokenizer=tokenizer, + save_directory=tmp_dir, + expected_quantized_matmuls=expected_quantized_matmuls, + is_static=True, + load_onnx_model=False, + num_samples=num_samples, + load_inc_model=False, + load_ipex_model=True, + ) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index 9d85b85cbd..e38ba8e327 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -44,7 +44,7 @@ pipeline, set_seed, ) -from utils_tests import SEED, INCTestMixin, _generate_dataset +from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset from optimum.intel.utils.import_utils import is_torch_version, is_intel_extension_for_transformers_available @@ -64,38 +64,30 @@ from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification from optimum.pipelines import ORT_SUPPORTED_TASKS -if is_intel_extension_for_transformers_available(): - from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig os.environ["CUDA_VISIBLE_DEVICES"] = "" set_seed(SEED) -class OptimizationTest(INCTestMixin): +class QuantizationTest(INCTestMixin): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( - ("text-classification", "hf-internal-testing/tiny-random-BertForSequenceClassification", 21), - # ("text-generation", "hf-internal-testing/tiny-random-BloomForCausalLM", 21), # TODO : enable causal lm task once INC ONNX export fixed + ("text-classification", "bert", 21), + # ("text-generation", "bloom", 21), ) SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + ( - ("fill-mask", "hf-internal-testing/tiny-random-BertForMaskedLM", 22), - ("token-classification", "hf-internal-testing/tiny-random-AlbertForTokenClassification", 26), + ("fill-mask", "bert", 22), + ("token-classification", "albert", 26), ) TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( - "hf-internal-testing/tiny-random-BloomForCausalLM", - "hf-internal-testing/tiny-random-GPTNeoForCausalLM", - ) - - WEIGHT_ONLY_CONFIG = ( - ("RTN", "int4_clip"), - ("GPTQ", "int4_clip"), - ("RTN", "int8"), - ("", ""), + "bloom", + "gpt_neo", ) @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC) - def test_dynamic_quantization(self, task, model_name, expected_quantized_matmuls): + def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] quantization_config = PostTrainingQuantConfig(approach="dynamic") model_class = ORT_SUPPORTED_TASKS[task]["class"][0] tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -130,8 +122,9 @@ def test_dynamic_quantization(self, task, model_name, expected_quantized_matmuls ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_static_quantization(self, task, model_name, expected_quantized_matmuls): + def test_static_quantization(self, task, model_arch, expected_quantized_matmuls): num_samples = 10 + model_name = MODEL_NAMES[model_arch] model_class = ORT_SUPPORTED_TASKS[task]["class"][0] tokenizer = AutoTokenizer.from_pretrained(model_name) if tokenizer.pad_token is None: @@ -175,82 +168,6 @@ def test_static_quantization(self, task, model_name, expected_quantized_matmuls) num_samples=num_samples, ) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - @unittest.skipIf(is_torch_version(">=", "2.2.0"), "compatibility issue with torch 2.2.0 and IPEX latest version") - def 
test_ipex_static_quantization_with_smoothquant(self, task, model_name, expected_quantized_matmuls): - recipes = {"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}} - num_samples = 10 - quantization_config = PostTrainingQuantConfig(approach="static", backend="ipex", recipes=recipes) - model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - quantizer = INCQuantizer.from_pretrained(model, task=task) - calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples) - - with tempfile.TemporaryDirectory() as tmp_dir: - quantizer.quantize( - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, - save_directory=tmp_dir, - save_onnx_model=False, - ) - self.check_model_outputs( - q_model=quantizer._quantized_model, - task=task, - tokenizer=tokenizer, - save_directory=tmp_dir, - expected_quantized_matmuls=expected_quantized_matmuls, - is_static=True, - load_onnx_model=False, - num_samples=num_samples, - load_inc_model=False, - load_ipex_model=True, - ) - - @parameterized.expand(WEIGHT_ONLY_CONFIG) - @unittest.skipIf( - not is_intel_extension_for_transformers_available(), reason="Intel-extension-for-transformers not available!" - ) - def test_weight_only_quantization(self, methodology, weight_dtype): - model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM" - model = AutoModelForCausalLM.from_pretrained(model_name) - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") - calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) - - with tempfile.TemporaryDirectory() as tmp_dir: - if methodology: - gptq_args = { - "percdamp": 0.01, - "act_order": False, - "scheme": "sym", - } - - quantization_config = WeightOnlyQuantConfig( - algorithm=methodology, - algorithm_args=gptq_args if methodology == "GPTQ" else None, - weight_dtype=weight_dtype, - ) - quantizer.quantize( - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, - save_directory=tmp_dir, - ) - else: - quantizer.quantize( - quantization_config=None, - save_directory=tmp_dir, - weight_only=True, # use RTN quantization method and NF4 weight data type is default. 
- ) - - q_model = INCModelForCausalLM.from_pretrained(tmp_dir) - inp = torch.tensor([calibration_dataset[0]["input_ids"]]) - out = model(inp)[0] - q_out = q_model(inp)[0] - self.assertTrue(torch.all(torch.isclose(out, q_out, atol=5e-1))) - def test_dynamic_accuracy_strategy_quantization(self): model_name = "distilbert-base-cased-distilled-squad" model = AutoModelForQuestionAnswering.from_pretrained(model_name) @@ -330,7 +247,8 @@ def test_dynamic_diffusion_model(self): self.assertTrue(np.allclose(loaded_pipe_outputs, outputs, atol=1e-4)) @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) - def test_quantize_text_generate_model(self, model_id): + def test_quantize_text_generate_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] set_seed(42) model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -357,8 +275,13 @@ def calibration_fn(p_model): outputs = model.generate(**tokens, do_sample=False, num_beams=1, temperature=0.9, min_length=20, max_length=20) self.assertTrue(torch.equal(pre_outputs, outputs)) + +class TrainingOptimizationTest(INCTestMixin): + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("text-classification", "bert", 21),) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_aware_training_quantization(self, task, model_name, expected_quantized_matmuls): + def test_aware_training_quantization(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] quantization_config = QuantizationAwareTrainingConfig() save_onnx_model = False @@ -381,7 +304,8 @@ def test_aware_training_quantization(self, task, model_name, expected_quantized_ ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_aware_training_quantization_pruning(self, task, model_name, expected_quantized_matmuls): + def test_aware_training_quantization_pruning(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] quantization_config = QuantizationAwareTrainingConfig() target_sparsity = 0.9 pruning_config = WeightPruningConfig( @@ -413,7 +337,8 @@ def test_aware_training_quantization_pruning(self, task, model_name, expected_qu ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_magnitude_pruning(self, task, model_name, expected_quantized_matmuls): + def test_magnitude_pruning(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] target_sparsity = 0.9 # end_step should be training_args.num_train_epochs * (len(train_dataset) // training_args.per_device_train_batch_size) pruning_config = WeightPruningConfig( @@ -452,7 +377,8 @@ def test_magnitude_pruning(self, task, model_name, expected_quantized_matmuls): self.assertEqual(inc_config.pruning["pattern"], "4x1") @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) - def test_distillation(self, task, model_name, expected_quantized_matmuls): + def test_distillation(self, task, model_arch, expected_quantized_matmuls): + model_name = MODEL_NAMES[model_arch] teacher_model = ORT_SUPPORTED_TASKS[task]["class"][0].auto_model_class.from_pretrained(model_name) distillation_config = DistillationConfig(teacher_model=teacher_model) save_onnx_model = True @@ -575,3 +501,54 @@ def _compute_metrics(pred): self.assertIsInstance(loaded_model_outputs.logits, torch.Tensor) # Compare tensor outputs # self.assertTrue(torch.allclose(loaded_model_outputs.logits, 
model_outputs.logits, atol=1e-4)) + + +class WeightOnlyQuantizationTest(INCTestMixin): + WEIGHT_ONLY_CONFIG = ( + ("rtn", "int4_clip"), + ("rtn", "int8"), + ("gptq", "int4_clip"), + ) + + @parameterized.expand(WEIGHT_ONLY_CONFIG) + @unittest.skipIf(not is_intel_extension_for_transformers_available(), reason="ITREX not available") + def test_weight_only_quantization(self, methodology, weight_dtype): + model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM" + + from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig + + bits = 4 if "4" in weight_dtype else 8 + if methodology == "gptq": + # max_input_length can be removed after neural-compressor > v2.5.1 + quantization_config = GPTQConfig( + bits=bits, sym=True, damp_percent=0.01, weight_dtype=weight_dtype, max_input_length=128 + ) + else: + quantization_config = RtnConfig(bits=bits, weight_dtype=weight_dtype) + + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + quantizer = INCQuantizer.from_pretrained(copy.deepcopy(model), task="text-generation") + calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=2) + + with tempfile.TemporaryDirectory() as tmp_dir: + quantizer.quantize( + quantization_config=quantization_config, + calibration_dataset=calibration_dataset, + save_directory=tmp_dir, + ) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir) + + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with torch.no_grad(): + loaded_outputs = loaded_model(**tokens) + # quantizer_outputs = model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + + # self.assertTrue(torch.allclose(quantizer_outputs.logits, loaded_outputs.logits, equal_nan=True, atol=1e-4)) diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py index 214aa73be5..c91270355a 100644 --- a/tests/neural_compressor/utils_tests.py +++ b/tests/neural_compressor/utils_tests.py @@ -41,15 +41,15 @@ ) -from optimum.intel.utils.import_utils import is_torch_version +from optimum.intel.utils.import_utils import is_ipex_available from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS from optimum.intel.utils.constant import ONNX_WEIGHTS_NAME from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification from optimum.pipelines import ORT_SUPPORTED_TASKS -if is_torch_version("<", "2.2.0"): - from optimum.intel.ipex import ( +if is_ipex_available(): + from optimum.intel import ( IPEXModelForCausalLM, IPEXModelForSequenceClassification, IPEXModelForMaskedLM, @@ -65,6 +65,50 @@ } +MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", + "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", + "distilbert": 
"hf-internal-testing/tiny-random-distilbert", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": "hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", +} + + def num_quantized_matmul_onnx_model(onnx_model): num_quantized_matmul = 0 for node in onnx_model.graph.node: @@ -145,8 +189,7 @@ def check_model_outputs( ort_model = ORT_SUPPORTED_TASKS[task]["class"][0].from_pretrained(save_directory, **model_kwargs) ort_outputs = ort_model(**tokens) self.assertTrue("logits" in ort_outputs) - if task != "fill-mask": - self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-2)) + # self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-2)) @staticmethod def get_trainer( diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 21bec021f8..9d1daaab63 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -19,15 +19,18 @@ from typing import Optional from parameterized import parameterized +from transformers import AutoConfig from utils_tests import MODEL_NAMES from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED -from optimum.exporters.openvino import export_from_model +from optimum.exporters.onnx.model_configs import BertOnnxConfig +from optimum.exporters.openvino import export_from_model, main_export from optimum.exporters.tasks import TasksManager from optimum.intel import ( OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, @@ -114,3 +117,39 @@ def _openvino_export( @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_export(self, model_type: str): self._openvino_export(model_type) + + +class CustomExportModelTest(unittest.TestCase): + def test_export_custom_model(self): + class BertOnnxConfigWithPooler(BertOnnxConfig): + @property + def outputs(self): + if self.task == "feature-extraction-with-pooler": + common_outputs = {} + common_outputs["last_hidden_state"] = {0: "batch_size", 1: "sequence_length"} + 
common_outputs["pooler_output"] = {0: "batch_size"} + else: + common_outputs = super().outputs + + return common_outputs + + base_task = "feature-extraction" + custom_task = f"{base_task}-with-pooler" + model_id = "sentence-transformers/all-MiniLM-L6-v2" + + config = AutoConfig.from_pretrained(model_id) + custom_export_configs = {"model": BertOnnxConfigWithPooler(config, task=custom_task)} + + with TemporaryDirectory() as tmpdirname: + main_export( + model_name_or_path=model_id, + custom_export_configs=custom_export_configs, + library_name="transformers", + output=Path(tmpdirname), + task=base_task, + ) + + ov_model = OVModelForCustomTasks.from_pretrained(tmpdirname) + + self.assertIsInstance(ov_model, OVBaseModel) + self.assertTrue(ov_model.output_names == {"last_hidden_state": 0, "pooler_output": 1}) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 46c6e3c69a..09fad5d773 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -26,6 +26,7 @@ from optimum.exporters.openvino.__main__ import main_export from optimum.intel import ( # noqa + OVLatentConsistencyModelPipeline, OVModelForAudioClassification, OVModelForCausalLM, OVModelForFeatureExtraction, @@ -65,7 +66,7 @@ class OVCLIExportTestCase(unittest.TestCase): ) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { "gpt2": 2, - "t5": 0, # failed internal sentencepiece check - no token in the vocab + "t5": 0, # no .model file in the repository "albert": 0, # not supported yet "distilbert": 1, # no detokenizer "roberta": 2, @@ -77,6 +78,12 @@ class OVCLIExportTestCase(unittest.TestCase): "stable-diffusion-xl": 0, # not supported } + SUPPORTED_SD_HYBRID_ARCHITECTURES = ( + ("stable-diffusion", 72, 195), + ("stable-diffusion-xl", 84, 331), + ("latent-consistency", 50, 135), + ) + SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),) SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"] @@ -118,26 +125,26 @@ def test_exporters_cli(self, task: str, model_type: str): for arch in SUPPORTED_ARCHITECTURES if not arch[0].endswith("-with-past") and not arch[1].endswith("-refiner") ) - @unittest.skipIf(not is_openvino_tokenizers_available(), reason="OpenVINO Tokenizers not available") def test_exporters_cli_tokenizers(self, task: str, model_type: str): with TemporaryDirectory() as tmpdir: output = subprocess.check_output( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --convert-tokenizer --task {task} {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}", shell=True, stderr=subprocess.STDOUT, ).decode() - save_dir = Path(tmpdir) - number_of_tokenizers = sum("tokenizer" in file for file in map(str, save_dir.rglob("*.xml"))) - self.assertEqual( - self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type], - number_of_tokenizers, - f"OVT: {is_openvino_tokenizers_available() }", - ) + if not is_openvino_tokenizers_available(): + self.assertTrue( + "OpenVINO Tokenizers is not available." in output + or "OpenVINO and OpenVINO Tokenizers versions are not binary compatible." in output, + msg=output, + ) + return + + number_of_tokenizers = sum("tokenizer" in file for file in map(str, Path(tmpdir).rglob("*.xml"))) + self.assertEqual(self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type], number_of_tokenizers, output) if number_of_tokenizers == 1: self.assertTrue("Detokenizer is not supported, convert tokenizer only." 
in output, output) - elif number_of_tokenizers == 0 and task not in ("image-classification", "audio-classification"): - self.assertTrue(("OpenVINO Tokenizer export for" in output and "is not supported." in output), output) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_exporters_cli_fp16(self, task: str, model_type: str): @@ -176,6 +183,19 @@ def test_exporters_cli_int8(self, task: str, model_type: str): _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_int8[i], num_int8) + @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES) + def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int): + with TemporaryDirectory() as tmpdir: + subprocess.run( + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --dataset laion/filtered-wit --weight-format int8 {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS[model_type]).from_pretrained(tmpdir) + num_fq, num_int8, _ = get_num_quantized_nodes(model.unet) + self.assertEqual(exp_num_int8, num_int8) + self.assertEqual(exp_num_fq, num_fq) + @parameterized.expand(TEST_4BIT_CONFIGURATONS) def test_exporters_cli_int4(self, task: str, model_type: str, option: str): with TemporaryDirectory() as tmpdir: @@ -198,3 +218,31 @@ def test_exporters_cli_help(self): shell=True, check=True, ) + + def test_exporters_cli_sentence_transformers(self): + model_id = MODEL_NAMES["bge"] + with TemporaryDirectory() as tmpdir: + # default export creates transformers model + subprocess.run( + f"optimum-cli export openvino --model {model_id} --task feature-extraction {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS["feature-extraction"]).from_pretrained(tmpdir, compile=False) + self.assertTrue("last_hidden_state" in model.output_names) + # export with transformers lib creates transformers model + subprocess.run( + f"optimum-cli export openvino --model {model_id} --task feature-extraction --library transformers {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS["feature-extraction"]).from_pretrained(tmpdir, compile=False) + self.assertTrue("last_hidden_state" in model.output_names) + # export with sentence_transformers lib creates sentence_transformers model + subprocess.run( + f"optimum-cli export openvino --model {model_id} --task feature-extraction --library sentence_transformers {tmpdir}", + shell=True, + check=True, + ) + model = eval(_HEAD_TO_AUTOMODELS["feature-extraction"]).from_pretrained(tmpdir, compile=False) + self.assertFalse("last_hidden_state" in model.output_names) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 65094ae221..f84cac8161 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -20,6 +20,7 @@ from typing import Dict import numpy as np +import pytest import requests import timm import torch @@ -53,6 +54,7 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor +from transformers.testing_utils import slow from utils_tests import MODEL_NAMES from optimum.intel import ( @@ -61,6 +63,7 @@ OVModelForAudioXVector, OVModelForCausalLM, OVModelForCTC, + OVModelForCustomTasks, OVModelForFeatureExtraction, OVModelForImageClassification, OVModelForMaskedLM, @@ -364,6 +367,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = 
OVModelForQuestionAnswering.from_pretrained(model_id, export=True) @@ -379,6 +384,8 @@ def test_pipeline(self, model_arch): del model gc.collect() + @pytest.mark.run_slow + @slow def test_metric(self): model_id = "distilbert-base-cased-distilled-squad" set_seed(SEED) @@ -431,6 +438,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForTokenClassification.from_pretrained(model_id, export=True) @@ -481,6 +490,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForFeatureExtraction.from_pretrained(model_id, export=True) @@ -524,10 +535,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "stablelm", "starcoder2", "phi", + "internlm2", + "orion", + "falcon", ) GENERATION_LENGTH = 100 - IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3") - REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen") + REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "olmo", "orion") @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -551,37 +564,63 @@ def test_compare_to_transformers(self, model_arch): ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs) self.assertIsInstance(ov_model.config, PretrainedConfig) self.assertTrue(ov_model.use_cache) - self.assertEqual( - ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful - ) - set_seed(SEED) - transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) - if model_arch == "qwen": - transformers_model.to(torch.float32) - tokens = tokenizer( - "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None - ) - ov_outputs = ov_model(**tokens) + tokens = tokenizer("This is a sample output", return_tensors="pt") + ov_outputs = ov_model(**tokens) self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) self.assertTrue("past_key_values" in ov_outputs) self.assertIsInstance(ov_outputs.past_key_values, tuple) - is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL + is_stateful = ov_model.config.model_type not in not_stateful self.assertEqual(ov_model.stateful, is_stateful) if is_stateful: self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + + set_seed(SEED) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + if model_arch == "qwen": + transformers_model.to(torch.float32) + with torch.no_grad(): transformers_outputs = transformers_model(**tokens) # Compare tensor outputs self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4)) + + # Qwen tokenizer does not support padding + if model_arch == "qwen": + return + + if model_arch != "chatglm": + tokenizer.pad_token_id = tokenizer.eos_token_id + # Compare batched generation + 
tokenizer.padding_side = "left" + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + ov_model.generation_config.eos_token_id = None + transformers_model.generation_config.eos_token_id = None + ov_model.config.eos_token_id = None + transformers_model.config.eos_token_id = None + gen_config = GenerationConfig( + max_new_tokens=30, + min_new_tokens=30, + num_beams=3, + do_sample=False, + eos_token_id=None, + ) + + ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_outputs, transformers_outputs)) + del transformers_model del ov_model gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): model_kwargs = {} model_id = MODEL_NAMES[model_arch] @@ -611,30 +650,6 @@ def test_pipeline(self, model_arch): del model gc.collect() - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_multiple_inputs(self, model_arch): - model_id = MODEL_NAMES[model_arch] - set_seed(SEED) - if model_arch == "qwen": - self.skipTest("Qwen tokenizer does not support padding") - model_kwargs = {} - if model_arch in self.REMOTE_CODE_MODELS: - model_kwargs = { - "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), - "trust_remote_code": True, - } - model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, **model_kwargs) - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) - tokenizer.pad_token = tokenizer.eos_token - texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] - tokens = tokenizer(texts, padding=True, return_tensors="pt") - generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2) - outputs = model.generate(**tokens, generation_config=generation_config) - self.assertIsInstance(outputs, torch.Tensor) - self.assertEqual(outputs.shape[0], 3) - del model - gc.collect() - def test_model_and_decoder_same_device(self): model_id = MODEL_NAMES["gpt2"] model = OVModelForCausalLM.from_pretrained(model_id, export=True) @@ -660,12 +675,11 @@ def test_compare_with_and_without_past_key_values(self): self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - if self.IS_SUPPORT_STATEFUL: - model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True) - outputs_model_stateful = model_stateful.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful)) + model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True) + outputs_model_stateful = model_stateful.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) + self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful)) del model_with_pkv del model_without_pkv @@ -844,6 +858,8 @@ def test_compare_to_transformers(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_pipeline(self, model_arch): 
model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, export=True) @@ -974,6 +990,8 @@ def test_pipeline(self, model_arch): gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True) @@ -1431,6 +1449,8 @@ def test_load_vanilla_transformers_which_is_not_supported(self): self.assertIn("only supports the tasks", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow def test_generate_utils(self, model_arch: str): model_id = MODEL_NAMES[model_arch] model = OVModelForVision2Seq.from_pretrained(model_id, export=True) @@ -1506,3 +1526,87 @@ def test_pipeline_image_to_text(self, model_arch: str): self.assertIsInstance(outputs[0]["generated_text"], str) gc.collect() + + +class OVModelForCustomTasksIntegrationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES_WITH_ATTENTION = ["vit-with-attentions"] + SUPPORTED_ARCHITECTURES_WITH_HIDDEN_STATES = ["vit-with-hidden-states"] + + def _get_sample_image(self): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_ATTENTION) + def test_compare_output_attentions(self, model_arch): + model_id = MODEL_NAMES[model_arch] + + image = self._get_sample_image() + preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + inputs = preprocessor(images=image, return_tensors="pt") + + transformers_model = AutoModelForImageClassification.from_pretrained(model_id) + transformers_model.eval() + with torch.no_grad(): + transformers_outputs = transformers_model(**inputs, output_attentions=True) + + ov_model = OVModelForCustomTasks.from_pretrained(model_id, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + + for input_type in ["pt", "np"]: + inputs = preprocessor(images=image, return_tensors=input_type) + ov_outputs = ov_model(**inputs) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + self.assertTrue(len(ov_outputs.attentions) == len(transformers_outputs.attentions)) + for i in range(len(ov_outputs.attentions)): + self.assertTrue( + torch.allclose( + torch.Tensor(ov_outputs.attentions[i]), + transformers_outputs.attentions[i], + atol=1e-4, # attentions are accurate + rtol=1e-4, # attentions are accurate + ), + f"Attention mismatch at layer {i}", + ) + + del transformers_model + del ov_model + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HIDDEN_STATES) + def test_compare_output_hidden_states(self, model_arch): + model_id = MODEL_NAMES[model_arch] + + image = self._get_sample_image() + preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + inputs = preprocessor(images=image, return_tensors="pt") + + transformers_model = AutoModelForImageClassification.from_pretrained(model_id) + transformers_model.eval() + with torch.no_grad(): + transformers_outputs = transformers_model(**inputs, output_hidden_states=True) + + ov_model = OVModelForCustomTasks.from_pretrained(model_id, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + for input_type in ["pt", "np"]: + inputs = preprocessor(images=image, 
return_tensors=input_type) + ov_outputs = ov_model(**inputs) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + self.assertTrue(len(ov_outputs.hidden_states) == len(transformers_outputs.hidden_states)) + for i in range(len(ov_outputs.hidden_states)): + self.assertTrue( + torch.allclose( + torch.Tensor(ov_outputs.hidden_states[i]), + transformers_outputs.hidden_states[i], + atol=1e-3, # hidden states are less accurate + rtol=1e-2, # hidden states are less accurate + ), + f"Hidden states mismatch at layer {i}", + ) + del transformers_model + del ov_model + gc.collect() diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 0e307fb036..e269578c35 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -15,15 +15,19 @@ # ruff: noqa import itertools +import logging import tempfile import unittest from collections import defaultdict +from enum import Enum from functools import partial +from typing import List, Union import evaluate import numpy as np import torch from datasets import load_dataset +from nncf.quantization.advanced_parameters import OverflowFix from parameterized import parameterized import openvino.runtime as ov import nncf @@ -37,6 +41,7 @@ TrainingArguments, default_data_collator, ) +from transformers.utils.quantization_config import QuantizationMethod from optimum.intel import ( OVConfig, @@ -55,8 +60,10 @@ OVStableDiffusionXLPipeline, OVQuantizer, OVTrainer, + OVQuantizationConfig, OVWeightQuantizationConfig, ) +from optimum.intel.openvino.configuration import OVQuantizationMethod, OVQuantizationConfigBase from optimum.intel.openvino.quantization import InferRequestWrapper from optimum.intel.utils.import_utils import is_openvino_version @@ -98,7 +105,13 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, file_name=file_name) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize( + save_directory=tmp_dir, + calibration_dataset=calibration_dataset, + file_name=file_name, + ov_config=ov_config, + ) model = model_cls.from_pretrained(tmp_dir, file_name=file_name) # TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm) @@ -110,6 +123,10 @@ def preprocess_function(examples, tokenizer): outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): task = model_cls.export_feature @@ -134,7 +151,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="train", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) model = model_cls.from_pretrained(tmp_dir) @@ -146,6 +164,10 @@ def preprocess_function(examples, tokenizer): 
outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + class OVWeightCompressionTest(unittest.TestCase): # TODO : add models @@ -210,7 +232,7 @@ class OVWeightCompressionTest(unittest.TestCase): ratio=0.8, sensitivity_metric="mean_activation_magnitude", dataset="ptb", - awq=True, + quant_method=QuantizationMethod.AWQ, ), 14, ), @@ -251,7 +273,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -261,6 +283,15 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + original_config_as_dict = OVWeightQuantizationConfig().to_dict() + for k in original_config_as_dict.keys(): + v = original_config_as_dict[k] + if isinstance(v, Enum): + original_config_as_dict[k] = v.value + self.assertEqual(original_config_as_dict, loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature @@ -272,7 +303,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -282,6 +313,10 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4): task = model_cls.export_feature @@ -297,7 +332,6 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i ov_config = OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, ratio=0.8)) quantizer.quantize( save_directory=tmp_dir, - weights_only=True, ov_config=ov_config, ) model = model_cls.from_pretrained(tmp_dir) @@ -310,6 +344,10 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + 
@parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above") def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, expected_pt_int8, expected_ov_int8): @@ -322,7 +360,7 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp tokenizer.pad_token = tokenizer.eos_token quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) - quantizer.quantize(save_directory=tmp_dir, weights_only=True) + quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) _, num_int8, _ = get_num_quantized_nodes(model) @@ -332,6 +370,10 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_id, exp outputs = model(**tokens) self.assertTrue("logits" in outputs) + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False) @@ -401,17 +443,18 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") if model_id == "facebook/opt-125m": for key, value in self.DEFAULT_INT4_CONFIG.items(): - self.assertEqual(value, openvino_config.quantization_config[key]) + self.assertEqual(value, getattr(openvino_config.quantization_config, key)) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( self, model_cls, model_id, quantization_config, expected_ov_int4 ): with tempfile.TemporaryDirectory() as tmp_dir: + quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config) model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: @@ -422,7 +465,7 @@ def test_ovmodel_4bit_auto_compression_with_config( model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) - self.assertEqual(openvino_config.quantization_config["bits"], 4) + self.assertEqual(openvino_config.quantization_config.bits, 4) self.assertEqual(openvino_config.dtype, "int4") @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS) @@ -453,9 +496,8 @@ def transform_fn(data, tokenizer): model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig( - bits=4, sym=True, group_size=-1, ratio=0.8, dataset=quantization_dataset - ), + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + calibration_dataset=quantization_dataset, ) _, num_int8, num_int4 = get_num_quantized_nodes(model) @@ -545,7 +587,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): "all_layers": None, "sensitivity_metric": None, "dataset": None, - "ignored_scope": None, + "ignored_scope": nncf.IgnoredScope(), } 
compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @@ -571,7 +613,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -586,6 +629,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_ovmodel_static_quantization(self, model_name): def preprocess_function(examples, tokenizer): @@ -604,7 +651,8 @@ def preprocess_function(examples, tokenizer): num_samples=10, dataset_split="test", ) - quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset) + ov_config = OVConfig(quantization_config=OVQuantizationConfig()) + quantizer.quantize(save_directory=tmp_dir, calibration_dataset=calibration_dataset, ov_config=ov_config) # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir) @@ -619,6 +667,10 @@ def preprocess_function(examples, tokenizer): except RuntimeError: self.fail("Loading BERT QA model a second time failed") + # Verify that the configuration is correctly saved and loaded + loaded_config = OVConfig.from_pretrained(tmp_dir) + self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) + class OVTrainerTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 50, 38),) @@ -666,6 +718,132 @@ def compute_metrics(p): self.assertTrue("logits" in outputs) +class OVQuantizationConfigTest(unittest.TestCase): + QUANTIZATION_CONFIGS = ( + (None,), + (OVWeightQuantizationConfig(),), + ( + OVWeightQuantizationConfig( + bits=8, + sym=True, + ), + ), + ( + OVWeightQuantizationConfig( + dataset="wikitext", + bits=4, + ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + num_samples=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + ), + (OVWeightQuantizationConfig(dataset=["hello world", "i'm alive"]),), + ( + OVQuantizationConfig( + ignored_scope={"names": ["op_name"]}, + num_samples=100, + sym=False, + model_type="transformer", + fast_bias_correction=True, + overflow_fix="disable", + ), + ), + (OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),), + ) + + QUANTIZATION_CONFIG_DICTS = ( + (dict(bits=8, sym=True), OVWeightQuantizationConfig, None), + ( + dict( + dataset="wikitext", + bits=4, + ignored_scope={"names": ["op_name"]}, + sym=False, + tokenizer="dbmdz/bert-base-german-cased", + ratio=1.0, + group_size=128, + all_layers=True, + sensitivity_metric="mean_activation_magnitude", + num_samples=100, + quant_method=OVQuantizationMethod.DEFAULT, + ), + OVWeightQuantizationConfig, + None, + ), + (dict(), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), + ( + 
+            dict(ignored_scope={"names": ["op_name"]}),
+            OVWeightQuantizationConfig,
+            "Can't determine type of OV quantization config",
+        ),
+        (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
+        (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"),
+        (
+            dict(bits=8, fast_bias_correction=True),
+            OVWeightQuantizationConfig,
+            "Can't determine type of OV quantization config",
+        ),
+        (dict(model_type="transformer"), OVQuantizationConfig, None),
+        (
+            dict(
+                ignored_scope={"names": ["op_name"]},
+                num_samples=100,
+                sym=False,
+                model_type="transformer",
+                fast_bias_correction=True,
+                overflow_fix="disable",
+            ),
+            OVQuantizationConfig,
+            None,
+        ),
+        (dict(weight_only=True), OVWeightQuantizationConfig, None),
+        (dict(weight_only=False), OVQuantizationConfig, None),
+        (dict(abc="def", weight_only=False), OVQuantizationConfig, None),
+        (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None),
+        (dict(bits=8, fast_bias_correction=True, weight_only=True), OVWeightQuantizationConfig, None),
+        (dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None),
+        (dict(bits=8, sym=True, weight_only=False), OVWeightQuantizationConfig, "Please check your configuration"),
+        (dict(model_type="transformer", weight_only=True), OVQuantizationConfig, "Please check your configuration"),
+    )
+
+    @parameterized.expand(QUANTIZATION_CONFIGS)
+    def test_config_serialization(self, quantization_config: OVQuantizationConfigBase):
+        ov_config = OVConfig(quantization_config=quantization_config)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            ov_config.save_pretrained(tmp_dir)
+            loaded_ov_config = OVConfig.from_pretrained(tmp_dir)
+
+            if quantization_config is None:
+                self.assertEqual(loaded_ov_config.quantization_config, None)
+                return
+            for key, value in loaded_ov_config.quantization_config.to_dict().items():
+                initial_value = getattr(ov_config.quantization_config, key)
+                self.assertEqual(value, initial_value)
+
+    @parameterized.expand(QUANTIZATION_CONFIG_DICTS)
+    def test_config_from_dict(self, quantization_config: dict, config_type: type, warning_log: Union[str, None]):
+        from optimum.intel.openvino.configuration import logger as configuration_logger
+
+        if warning_log is not None:
+            with self.assertLogs(configuration_logger, logging.WARN) as cm:
+                ov_config = OVConfig(quantization_config=quantization_config)
+            self.assertTrue(any(warning_log in log for log in cm.output))
+        else:
+            ov_config = OVConfig(quantization_config=quantization_config)
+        self.assertIsInstance(ov_config.quantization_config, config_type)
+        for k, v in quantization_config.items():
+            if k == "weight_only" and warning_log == "Please check your configuration":
+                continue
+            if hasattr(ov_config.quantization_config, k):
+                self.assertEqual(getattr(ov_config.quantization_config, k), v)
+
+
 class InferRequestWrapperTest(unittest.TestCase):
     MODEL_ID = ("openai/whisper-tiny.en",)
     APPLY_CACHING = (False, True)
diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py
index 80298faf2b..db443c6de2 100644
--- a/tests/openvino/test_training.py
+++ b/tests/openvino/test_training.py
@@ -45,14 +45,14 @@
 from transformers.utils import WEIGHTS_NAME
 
 from optimum.intel.openvino import OVTrainingArguments
-from optimum.intel.openvino.configuration import DEFAULT_QUANTIZATION_CONFIG, OVConfig
+from optimum.intel.openvino.configuration import OVConfig
 from optimum.intel.openvino.modeling import (
     OVModel,
     OVModelForAudioClassification,
     OVModelForImageClassification,
     OVModelForSequenceClassification,
 )
-from optimum.intel.openvino.trainer import OVTrainer
+from optimum.intel.openvino.trainer import DEFAULT_QUANTIZATION_CONFIG, OVTrainer
 from optimum.intel.openvino.utils import OV_XML_FILE_NAME
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index c95444274e..ca56f6d552 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -19,6 +19,7 @@
 MODEL_NAMES = {
     "albert": "hf-internal-testing/tiny-random-albert",
     "audio_spectrogram_transformer": "Ericwang/tiny-random-ast",
+    "bge": "BAAI/bge-small-en-v1.5",
     "beit": "hf-internal-testing/tiny-random-BeitForImageClassification",
     "bert": "hf-internal-testing/tiny-random-bert",
     "bart": "hf-internal-testing/tiny-random-bart",
@@ -42,6 +43,7 @@
     "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder",
     "electra": "hf-internal-testing/tiny-random-electra",
     "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
+    "falcon": "fxmarty/really-tiny-falcon-testing",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
     "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
     "gpt2": "hf-internal-testing/tiny-random-gpt2",
@@ -50,6 +52,7 @@
     "gptj": "hf-internal-testing/tiny-random-GPTJModel",
     "hubert": "hf-internal-testing/tiny-random-HubertModel",
     "ibert": "hf-internal-testing/tiny-random-ibert",
+    "internlm2": "katuni4ka/tiny-random-internlm2",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "longt5": "hf-internal-testing/tiny-random-longt5",
     "llama": "fxmarty/tiny-llama-fast-tokenizer",
@@ -69,6 +72,8 @@
     "mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
     "mt5": "stas/mt5-tiny-random",
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
+    "olmo": "katuni4ka/tiny-random-olmo",
+    "orion": "katuni4ka/tiny-random-orion",
     "pegasus": "hf-internal-testing/tiny-random-pegasus",
     "pix2struct": "fxmarty/pix2struct-tiny-random",
     "phi": "echarlaix/tiny-random-PhiForCausalLM",
@@ -96,6 +101,8 @@
     "unispeech": "hf-internal-testing/tiny-random-unispeech",
     "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel",
     "vit": "hf-internal-testing/tiny-random-vit",
+    "vit-with-attentions": "IlyasMoutawwakil/vit-with-attentions",
+    "vit-with-hidden-states": "IlyasMoutawwakil/vit-with-hidden_states",
     "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2",
     "wavlm": "hf-internal-testing/tiny-random-WavlmModel",
     "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier",