Merge branch 'huggingface:main' into ipex_example
jiqing-feng authored Mar 20, 2024
2 parents 4f02485 + d2f9fdb commit 70874f4
Showing 37 changed files with 3,055 additions and 1,624 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/test_openvino.yml
@@ -36,3 +36,9 @@ jobs:
      - name: Test with Pytest
        run: |
          pytest tests/openvino/ --ignore test_modeling_basic
      - name: Test openvino-nightly
        run: |
          pip uninstall -y openvino
          pip install openvino-nightly
          python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
          optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
46 changes: 46 additions & 0 deletions .github/workflows/test_openvino_examples.yml
@@ -0,0 +1,46 @@
name: OpenVINO - Examples Test

on:
  workflow_dispatch:
  schedule:
    - cron: 0 1 * * 1 # run weekly: every Monday at 1am
  push:
    paths:
      - '.github/workflows/test_openvino_examples.yml'
      - 'examples/openvino/*'
  pull_request:
    paths:
      - '.github/workflows/test_openvino_examples.yml'
      - 'examples/openvino/*'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.8", "3.10"]

    runs-on: ubuntu-20.04

    steps:
      - uses: actions/checkout@v2
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          pip install optimum[openvino] jstyleson nncf pytest
          pip install -r examples/openvino/audio-classification/requirements.txt
          pip install -r examples/openvino/image-classification/requirements.txt
          pip install -r examples/openvino/question-answering/requirements.txt
          pip install -r examples/openvino/text-classification/requirements.txt
      - name: Test examples
        run: |
          python -m pytest examples/openvino/test_examples.py
2 changes: 2 additions & 0 deletions .github/workflows/test_openvino_notebooks.yml
@@ -49,5 +49,7 @@ jobs:

      - name: Test with Pytest
        run: |
          sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb
          sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb
          python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb notebooks/openvino/question_answering_quantization.ipynb
4 changes: 2 additions & 2 deletions Makefile
@@ -22,11 +22,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
# Run code quality checks
style_check:
black --check .
ruff .
ruff check .

style:
black .
ruff . --fix
ruff check . --fix

# Run tests for the library
test:
11 changes: 6 additions & 5 deletions docs/source/inference.mdx
@@ -99,21 +99,22 @@ tokenizer.save_pretrained(save_directory)

### Weight-only quantization

You can also apply 8-bit or 4-bit weight quantization when exporting your model with the CLI by setting the `weight-format` argument to respectively `int8` or `int4`:
You can also apply fp16, 8-bit or 4-bit weight compression on the Linear, Convolutional and Embedding layers when exporting your model with the CLI by setting `--weight-format` to `fp16`, `int8` or `int4` respectively:

```bash
optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
```

This will result in the exported model linear and embedding layers to be quantized to INT8 or INT4, the activations will be kept in floating point precision. This type of optimization allows reducing the footprint and latency of LLMs.
This type of optimization reduces the memory footprint and inference latency.

By default the quantization scheme will be [assymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`.

By default the quantization scheme is [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization); to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `--sym`.

For INT4 quantization you can also specify the following arguments:
* The `--group-size` parameter defines the group size to use for quantization; setting it to `-1` results in per-column quantization.
* The `--ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`.

Smaller `group_size` and `ratio` of usually improve accuracy at the sacrifice of the model size and inference latency.
Smaller `group_size` and `ratio` values usually improve accuracy at the cost of a larger model size and higher inference latency.
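
For example, a 4-bit export that combines both options could look like the following sketch; the group size and ratio values here are purely illustrative, and the model name and output directory are reused from the example above:

```bash
optimum-cli export openvino --model gpt2 --weight-format int4 --group-size 128 --ratio 0.8 ov_model
```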

You can also apply 8-bit quantization on your model's weights when loading it, by setting `load_in_8bit=True` when calling the `from_pretrained()` method.

@@ -125,7 +126,7 @@ model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)

<Tip warning={true}>

`load_in_8bit` is enabled by default for the models larger than 1 billion parameters.
`load_in_8bit` is enabled by default for models larger than 1 billion parameters. You can disable it with `load_in_8bit=False`.

</Tip>
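
For instance, a minimal sketch that keeps the weights in their original precision, assuming a `model_id` as in the example above:

```python
from optimum.intel import OVModelForCausalLM

# Explicitly opt out of the default 8-bit weight compression
model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=False)
```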

132 changes: 90 additions & 42 deletions docs/source/optimization_ov.mdx
@@ -19,27 +19,95 @@ limitations under the License.
🤗 Optimum Intel provides an `openvino` package that enables you to apply a variety of model compression methods, such as quantization and pruning, to many models hosted on the 🤗 hub using the [NNCF](https://docs.openvino.ai/2022.1/docs_nncf_introduction.html) framework.


## Post-training optimization
## Post-training

Post-training static quantization introduces an additional calibration step where data is fed through the network in order to compute the activations quantization parameters.
Here is how to apply static quantization on a fine-tuned DistilBERT:
Quantization is a technique to reduce the computational and memory costs of running inference by representing the weights and/or the activations with lower-precision data types such as 8-bit or 4-bit integers.

### Weight-only quantization

Quantization can be applied on the model's Linear, Convolutional and Embedding layers, enabling the loading of large models on memory-limited devices. For example, when applying 8-bit quantization, the resulting model will be x4 smaller than its fp32 counterpart. For 4-bit quantization, the reduction in memory could theoretically reach x8, but is closer to x6 in practice.
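
As a rough back-of-the-envelope check of these ratios (the 7-billion-parameter model size is purely illustrative):

```python
params = 7e9                          # e.g. a 7B-parameter model (illustrative)
fp32_gb = params * 4 / 1e9            # 4 bytes per weight        -> ~28 GB
int8_gb = fp32_gb / 4                 # ~x4 smaller               -> ~7 GB
int4_gb_theoretical = fp32_gb / 8     # ~x8 smaller in theory     -> ~3.5 GB
int4_gb_practical = fp32_gb / 6       # closer to ~x6 in practice -> ~4.7 GB
print(fp32_gb, int8_gb, int4_gb_theoretical, int4_gb_practical)
```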


#### 8-bit

For 8-bit weight quantization, you can set `load_in_8bit=True` to load your model's weights in 8-bit precision:

```python
from functools import partial
from transformers import AutoTokenizer
from optimum.intel import OVConfig, OVQuantizer, OVModelForSequenceClassification,
from optimum.intel import OVModelForCausalLM

model_id = "helenai/gpt2-ov"
model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)

# Saves the int8 model that will be x4 smaller than its fp32 counterpart
saving_directory = "gpt2_int8_ov"  # any local path works here
model.save_pretrained(saving_directory)
```

<Tip warning={true}>

`load_in_8bit` is enabled by default for models larger than 1 billion parameters. You can disable it with `load_in_8bit=False`.

</Tip>

You can also provide a `quantization_config` instead to specify additional optimization parameters.
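
For example, a minimal sketch that expresses the same 8-bit weight-only compression through an explicit config (reusing the `model_id` from above):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Explicit 8-bit weight-only compression, instead of relying on load_in_8bit=True
quantization_config = OVWeightQuantizationConfig(bits=8)
model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
```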

#### 4-bit

For the 4-bit weight quantization, you need a `quantization_config` to define the optimization parameters, for example:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(bits=4)
model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
```

You can tune the quantization parameters to achieve a better performance/accuracy trade-off as follows:

```python
quantization_config = OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb")
```

By default the quantization scheme is [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization); to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) you can add `sym=True`.

For 4-bit quantization you can also specify the following arguments in the quantization configuration:
* The `group_size` parameter defines the group size to use for quantization; setting it to `-1` results in per-column quantization.
* The `ratio` parameter controls the ratio between 4-bit and 8-bit quantization. If set to 0.9, it means that 90% of the layers will be quantized to `int4` while 10% will be quantized to `int8`.

Smaller `group_size` and `ratio` values usually improve accuracy at the cost of a larger model size and higher inference latency.
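
For instance, a sketch that combines these parameters; the values are illustrative rather than recommendations:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Group-wise 4-bit quantization, keeping roughly 10% of the layers in int8
quantization_config = OVWeightQuantizationConfig(bits=4, group_size=128, ratio=0.9)
model = OVModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
```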

### Static quantization

When applying post-training static quantization, both the weights and the activations are quantized.
To quantize the activations, an additional calibration step is needed, which consists of feeding a `calibration_dataset` through the network in order to estimate the activations' quantization parameters.

Here is how to apply static quantization on a fine-tuned DistilBERT given your own `calibration_dataset`:

```python
from transformers import AutoTokenizer
from optimum.intel import OVQuantizer, OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# The directory where the quantized model will be saved
save_dir = "ptq_model"

quantizer = OVQuantizer.from_pretrained(model)

# Apply static quantization and export the resulting quantized model to OpenVINO IR format
quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
# Save the tokenizer
tokenizer.save_pretrained(save_dir)
```

The calibration dataset can also be created easily using your `OVQuantizer`:

```python
from functools import partial

def preprocess_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

# Instantiate our OVQuantizer using the desired configuration
quantizer = OVQuantizer.from_pretrained(model)
# Create the calibration dataset used to perform static quantization
calibration_dataset = quantizer.get_calibration_dataset(
"glue",
@@ -48,59 +116,39 @@ calibration_dataset = quantizer.get_calibration_dataset(
num_samples=300,
dataset_split="train",
)
# Apply static quantization and export the resulting quantized model to OpenVINO IR format
quantizer.quantize(
calibration_dataset=calibration_dataset,
save_directory=save_dir,
)
# Save the tokenizer
tokenizer.save_pretrained(save_dir)
```

The `quantize()` method applies post-training static quantization and export the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device.

## Weight-only quantization
The `quantize()` method applies post-training static quantization and exports the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device.
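
Once exported, the quantized model can be loaded back with the corresponding `OVModelFor*` class. A minimal usage sketch, reusing the `save_dir` and `tokenizer` from the example above (the input sentence is just an example):

```python
from optimum.intel import OVModelForSequenceClassification

# Load the statically quantized model from the OpenVINO IR files
quantized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
inputs = tokenizer("This movie was great!", return_tensors="pt")
outputs = quantized_model(**inputs)
print(outputs.logits)
```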

You can optimize the performance of text-generation LLMs by quantizing weights to various precisions that provide different performance-accuracy trade-offs.

```python
from optimum.intel import OVModelForCausalLM
### Hybrid quantization

model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
```
Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion (SD) models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of the activations is comparable to that of the weights.
The U-Net component takes up most of the overall execution time of the pipeline. Thus, optimizing just this one component can bring substantial benefits in terms of inference speed while keeping acceptable accuracy without fine-tuning. Quantizing the rest of the diffusion pipeline does not significantly improve inference performance but could potentially lead to substantial accuracy degradation.
Therefore, the proposal is to apply quantization in *hybrid mode* for the U-Net model and weight-only quantization for the rest of the pipeline components:
* U-Net: quantization applied on both the weights and activations
* The text encoder, VAE encoder / decoder: quantization applied on the weights

<Tip warning={true}>

`load_in_8bit` is enabled by default for the models larger than 1 billion parameters.

</Tip>
The hybrid mode quantizes the weights of the MatMul and Embedding layers and the activations of the other layers, which helps preserve accuracy after optimization while reducing the model size.

For the 4-bit weight quantization you can use the `quantization_config` to specify the optimization parameters, for example:
The `quantization_config` defines the optimization parameters for the SD pipeline. To enable hybrid quantization, specify a quantization dataset in the `quantization_config`. If no dataset is defined, weight-only quantization is applied to all components.

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig

model = OVModelForCausalLM.from_pretrained(
model = OVStableDiffusionPipeline.from_pretrained(
model_id,
quantization_config=OVWeightQuantizationConfig(bits=4),
export=True,
quantization_config=OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions"),
)
```
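
Once quantized, the pipeline can be used like any other `OVStableDiffusionPipeline`. A short usage sketch, with an illustrative prompt:

```python
# Generate an image with the hybrid-quantized pipeline
prompt = "sailing ship in storm by Rembrandt"
image = model(prompt).images[0]
image.save("ship.png")
```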

You can tune quantization parameters to achieve a better performance accuracy trade-off as follows:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

model = OVModelForCausalLM.from_pretrained(
model_id,
quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb"),
)
```

For more details, please refer to the corresponding NNCF [documentation](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/CompressWeights.md).


## Training-time optimization
## Training-time

Apart from optimizing a model after training, such as with the post-training quantization above, `optimum.openvino` also provides optimization methods during training, namely Quantization-Aware Training (QAT) and Joint Pruning, Quantization and Distillation (JPQD).

18 changes: 8 additions & 10 deletions examples/openvino/image-classification/run_image_classification.py
@@ -151,12 +151,12 @@ class ModelArguments:
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
use_auth_token: bool = field(
default=False,
token: str = field(
default=None,
metadata={
"help": (
"Will use the token generated when running `huggingface-cli login` (necessary to use this script "
"with private models)."
"The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
"generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
)
},
)
@@ -239,8 +239,7 @@ def main():
data_args.dataset_name,
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
task="image-classification",
use_auth_token=True if model_args.use_auth_token else None,
token=model_args.token,
)
else:
data_files = {}
@@ -252,7 +251,6 @@ def main():
"imagefolder",
data_files=data_files,
cache_dir=model_args.cache_dir,
task="image-classification",
)

# If we don't have a validation split, split off a percentage of train as validation.
@@ -287,15 +285,15 @@ def compute_metrics(p):
finetuning_task="image-classification",
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
token=model_args.token,
)
model = AutoModelForImageClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
token=model_args.token,
ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
)

@@ -311,7 +309,7 @@ def compute_metrics(p):
model_args.feature_extractor_name or model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
token=model_args.token,
)

# Define torchvision transforms to be applied to each image.