
Support OpenVINO int8 static quantization #3025

Merged · 12 commits · Nov 1, 2024
2 changes: 1 addition & 1 deletion docs/package_reference/util.md
@@ -10,7 +10,7 @@
## Model Optimization
```eval_rst
.. automodule:: sentence_transformers.backend
-   :members: export_optimized_onnx_model, export_dynamic_quantized_onnx_model
+   :members: export_optimized_onnx_model, export_dynamic_quantized_onnx_model, export_static_quantized_openvino_model
```

## Similarity Metrics
71 changes: 71 additions & 0 deletions docs/sentence_transformer/usage/efficiency.rst
@@ -290,6 +290,77 @@ To convert a model to OpenVINO format, you can use the following code:
model = SentenceTransformer("intfloat/multilingual-e5-small", backend="openvino")
model.push_to_hub("intfloat/multilingual-e5-small", create_pr=True)

Quantizing OpenVINO Models
^^^^^^^^^^^^^^^^^^^^^^^^^^

OpenVINO models can be quantized to int8 precision using Optimum Intel to speed up inference.
To do this, you can use the :func:`~sentence_transformers.backend.export_static_quantized_openvino_model` function,
which saves the quantized model in a directory or model repository that you specify.
The function applies Post-Training Static Quantization (PTQ) and expects:

- ``model``: a Sentence Transformer model loaded with the OpenVINO backend.
- ``quantization_config``: an :class:`~optimum.intel.OVQuantizationConfig` instance that defines the quantization settings.
- ``model_name_or_path``: a path to save the quantized model file, or the repository name if you want to push it to the Hugging Face Hub.
- ``push_to_hub``: (Optional) a boolean to push the quantized model to the Hugging Face Hub.
- ``create_pr``: (Optional) a boolean to create a pull request when pushing to the Hugging Face Hub. Useful when you don't have write access to the repository.
- ``file_suffix``: (Optional) a string to append to the model name when saving it. If not specified, ``"qint8_quantized"`` will be used.

See this example for quantizing a model to ``int8`` with :doc:`static quantization <optimum-intel:openvino/optimization>`:

.. tab:: Hugging Face Hub Model

    Only quantize once::

        from sentence_transformers import SentenceTransformer, export_static_quantized_openvino_model
        from optimum.intel import OVQuantizationConfig

        model = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino")
        quantization_config = OVQuantizationConfig()
        export_static_quantized_openvino_model(model, quantization_config, "all-MiniLM-L6-v2", push_to_hub=True, create_pr=True)

    Before the pull request gets merged::

        from sentence_transformers import SentenceTransformer

        pull_request_nr = 2  # TODO: Update this to the number of your pull request
        model = SentenceTransformer(
            "all-MiniLM-L6-v2",
            backend="openvino",
            model_kwargs={"file_name": "openvino/openvino_model_qint8_quantized.xml"},
            revision=f"refs/pr/{pull_request_nr}",
        )

    Once the pull request gets merged::

        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer(
            "all-MiniLM-L6-v2",
            backend="openvino",
            model_kwargs={"file_name": "openvino/openvino_model_qint8_quantized.xml"},
        )

.. tab:: Local Model

    Only quantize once::

        from sentence_transformers import SentenceTransformer, export_static_quantized_openvino_model
        from optimum.intel import OVQuantizationConfig

        model = SentenceTransformer("path/to/my/mpnet-legal-finetuned", backend="openvino")
        quantization_config = OVQuantizationConfig()
        export_static_quantized_openvino_model(model, quantization_config, "path/to/my/mpnet-legal-finetuned")

    After quantizing::

        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer(
            "path/to/my/mpnet-legal-finetuned",
            backend="openvino",
            model_kwargs={"file_name": "openvino/openvino_model_qint8_quantized.xml"},
        )
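
The default ``OVQuantizationConfig()`` applies int8 Post-Training Static Quantization with its
standard calibration settings. As a minimal sketch of further tuning (``num_samples`` and ``sym``
are assumed constructor parameters here; verify them against your installed ``optimum-intel``
version), you can adjust the configuration before exporting::

    from optimum.intel import OVQuantizationConfig

    # Assumed parameters for illustration; check your optimum-intel version for support
    quantization_config = OVQuantizationConfig(
        num_samples=512,  # use more calibration samples than the default
        sym=True,         # use symmetric quantization
    )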

Benchmarks
----------

2 changes: 2 additions & 0 deletions sentence_transformers/__init__.py
@@ -7,6 +7,7 @@
import os

from sentence_transformers.backend import export_dynamic_quantized_onnx_model, export_optimized_onnx_model
from sentence_transformers.backend import export_static_quantized_openvino_model
from sentence_transformers.cross_encoder.CrossEncoder import CrossEncoder
from sentence_transformers.datasets import ParallelSentencesDataset, SentencesDataset
from sentence_transformers.LoggingHandler import LoggingHandler
@@ -37,4 +38,5 @@
    "quantize_embeddings",
    "export_optimized_onnx_model",
    "export_dynamic_quantized_onnx_model",
    "export_static_quantized_openvino_model",
]
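
With this addition, the new helper can be imported directly from the package root, alongside the existing ONNX helpers:

    from sentence_transformers import export_static_quantized_openvino_model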
133 changes: 119 additions & 14 deletions sentence_transformers/backend.py
@@ -16,6 +16,7 @@

try:
    from optimum.onnxruntime.configuration import OptimizationConfig, QuantizationConfig
    from optimum.intel import OVQuantizationConfig
except ImportError:
    pass

@@ -97,14 +98,15 @@ def export_optimized_onnx_model(
    if file_suffix is None:
        file_suffix = "optimized"

-   save_or_push_to_hub_onnx_model(
+   save_or_push_to_hub_model(
        export_function=lambda save_dir: optimizer.optimize(optimization_config, save_dir, file_suffix=file_suffix),
        export_function_name="export_optimized_onnx_model",
        config=optimization_config,
        model_name_or_path=model_name_or_path,
        push_to_hub=push_to_hub,
        create_pr=create_pr,
        file_suffix=file_suffix,
+       backend="onnx",
    )


@@ -180,32 +182,126 @@ def export_dynamic_quantized_onnx_model(
    if file_suffix is None:
        file_suffix = f"{quantization_config.weights_dtype.name.lower()}_quantized"

-   save_or_push_to_hub_onnx_model(
+   save_or_push_to_hub_model(
        export_function=lambda save_dir: quantizer.quantize(quantization_config, save_dir, file_suffix=file_suffix),
        export_function_name="export_dynamic_quantized_onnx_model",
        config=quantization_config,
        model_name_or_path=model_name_or_path,
        push_to_hub=push_to_hub,
        create_pr=create_pr,
        file_suffix=file_suffix,
+       backend="onnx",
    )


def export_static_quantized_openvino_model(
    model: SentenceTransformer,
    quantization_config: OVQuantizationConfig,
    model_name_or_path: str,
    push_to_hub: bool = False,
    create_pr: bool = False,
    file_suffix: str = "qint8_quantized",
) -> None:
"""
Export a quantized OpenVINO model from a SentenceTransformer model.

This function applies Post-Training Static Quantization (PTQ) using a calibration dataset, which calibrates
quantization constants without requiring model retraining. Each default quantization configuration converts
the model to int8 precision, enabling faster inference while maintaining accuracy.

See https://sbert.net/docs/sentence_transformer/usage/efficiency.html for more information & benchmarks.

Args:
model (SentenceTransformer): The SentenceTransformer model to be quantized. Must be loaded with `backend="openvino"`.
quantization_config (OVQuantizationConfig): The quantization configuration.
model_name_or_path (str): The path or Hugging Face Hub repository name where the quantized model will be saved.
push_to_hub (bool, optional): Whether to push the quantized model to the Hugging Face Hub. Defaults to False.
create_pr (bool, optional): Whether to create a pull request when pushing to the Hugging Face Hub. Defaults to False.
file_suffix (str, optional): The suffix to add to the quantized model file name. Defaults to `qint8_quantized`.

Raises:
ImportError: If the required packages `optimum` and `openvino` are not installed.
ValueError: If the provided model is not a valid SentenceTransformer model loaded with `backend="openvino"`.
ValueError: If the provided quantization_config is not valid.

Returns:
None
"""
    from sentence_transformers import SentenceTransformer
    from sentence_transformers.models.Transformer import Transformer

    try:
        from optimum.intel import OVConfig, OVModelForFeatureExtraction, OVQuantizer
    except ImportError:
        raise ImportError(
            "Please install Optimum and OpenVINO to use this function. "
            "You can install them with pip: `pip install optimum[openvino]`"
        )

    if (
        not isinstance(model, SentenceTransformer)
        or not len(model)
        or not isinstance(model[0], Transformer)
        or not isinstance(model[0].auto_model, OVModelForFeatureExtraction)
    ):
        raise ValueError(
            'The model must be a Transformer-based SentenceTransformer model loaded with `backend="openvino"`.'
        )

    ov_model: OVModelForFeatureExtraction = model[0].auto_model
    ov_config = OVConfig(quantization_config=quantization_config)
    quantizer = OVQuantizer.from_pretrained(ov_model)

    def preprocess_function(examples):
        return model.tokenizer(examples["sentence"], padding="max_length", max_length=384, truncation=True)

    calibration_dataset = quantizer.get_calibration_dataset(
        dataset_name="glue",
        dataset_config_name="sst2",
        preprocess_function=preprocess_function,
        num_samples=300,
        dataset_split="train",
    )

    save_or_push_to_hub_model(
        export_function=lambda save_dir: quantizer.quantize(calibration_dataset, save_directory=save_dir, ov_config=ov_config),
        export_function_name="export_static_quantized_openvino_model",
        config=quantization_config,
        model_name_or_path=model_name_or_path,
        push_to_hub=push_to_hub,
        create_pr=create_pr,
        file_suffix=file_suffix,
        backend="openvino",
    )


-def save_or_push_to_hub_onnx_model(
+def save_or_push_to_hub_model(
    export_function: Callable,
    export_function_name: str,
    config,
    model_name_or_path: str,
    push_to_hub: bool = False,
    create_pr: bool = False,
    file_suffix: str | None = None,
+   backend: str = "onnx",
):
    if backend == "onnx":
        file_name = f"model_{file_suffix}.onnx"
    elif backend == "openvino":
        file_name = "openvino_model.xml"
        destination_file_name = Path(f"openvino_model_{file_suffix}.xml")

    if push_to_hub:
        with tempfile.TemporaryDirectory() as save_dir:
            export_function(save_dir)
-           file_name = f"model_{file_suffix}.onnx"
-           source = (Path(save_dir) / file_name).as_posix()
-           destination = (Path("onnx") / file_name).as_posix()
+           if backend == "onnx":
+               source = (Path(save_dir) / file_name).as_posix()
+               destination = Path(backend) / file_name
+           elif backend == "openvino":
+               source = (Path(save_dir) / backend / file_name).as_posix()
+               destination = Path(backend) / destination_file_name
+           else:
+               raise NotImplementedError(f"Unsupported backend type: {backend}")

            commit_description = ""
            if create_pr:
@@ -230,7 +326,7 @@ def save_or_push_to_hub_onnx_model(
model = SentenceTransformer(
    "{model_name_or_path}",
    revision=f"refs/pr/{{pr_number}}",
-   backend="onnx",
+   backend="{backend}",
    model_kwargs={{"file_name": "{destination}"}},
)

@@ -245,10 +341,10 @@

            huggingface_hub.upload_file(
                path_or_fileobj=source,
-               path_in_repo=destination,
+               path_in_repo=destination.as_posix(),
                repo_id=model_name_or_path,
                repo_type="model",
-               commit_message=f"Add exported ONNX model {file_name!r}",
+               commit_message=f"Add exported {backend} model {destination.name!r}",
                commit_description=commit_description,
                create_pr=create_pr,
            )
@@ -257,9 +353,18 @@
        with tempfile.TemporaryDirectory() as save_dir:
            export_function(save_dir)

-           file_name = f"model_{file_suffix}.onnx"
-           source = os.path.join(save_dir, file_name)
-           destination = os.path.join(model_name_or_path, "onnx", file_name)
+           dst_dir = os.path.join(model_name_or_path, backend)
            # Create destination if it does not exist
-           os.makedirs(os.path.dirname(destination), exist_ok=True)
-           shutil.copy(source, destination)
+           os.makedirs(dst_dir, exist_ok=True)
+
+           if backend == "openvino":
+               source = Path(save_dir) / backend / file_name
+               bin_file = source.with_suffix(".bin")
+               xml_destination = os.path.join(dst_dir, destination_file_name)
+               bin_destination = os.path.join(dst_dir, destination_file_name.with_suffix(".bin"))
+               shutil.copy(source, xml_destination)
+               shutil.copy(bin_file, bin_destination)
+           else:
+               source = os.path.join(save_dir, file_name)
+               destination = os.path.join(dst_dir, file_name)
+               shutil.copy(source, destination)
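
As a quick sanity check after exporting (a sketch using the public ``encode`` and ``similarity`` APIs, not part of this PR; the file name assumes the default ``qint8_quantized`` suffix), compare embeddings from the original and quantized models:

    from sentence_transformers import SentenceTransformer

    original = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino")
    quantized = SentenceTransformer(
        "all-MiniLM-L6-v2",
        backend="openvino",
        model_kwargs={"file_name": "openvino/openvino_model_qint8_quantized.xml"},
    )

    sentences = ["OpenVINO makes CPU inference fast", "Static quantization trades a little accuracy for speed"]
    # Embeddings from the quantized model should stay close to the originals,
    # so the diagonal of the similarity matrix should be near 1.0
    embeddings_original = original.encode(sentences)
    embeddings_quantized = quantized.encode(sentences)
    print(original.similarity(embeddings_original, embeddings_quantized).diag())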