Add bits and sym parameters to the OV quantization config (#560)
* Add bits and sym parameters to the OV quantization config

* format

* add nncf version

* Fix config saving

* add ov config test

* remove load_in_4bit argument

* add weight only quant for int8

* fix style

* add nncf check

* remove _int4_weight_only_quantization

* fix typo

* fix style
echarlaix authored Feb 15, 2024
1 parent 7e1a21e commit 6c8fa79
Showing 9 changed files with 238 additions and 232 deletions.
2 changes: 2 additions & 0 deletions optimum/exporters/openvino/convert.py
@@ -500,6 +500,8 @@ def export_models(
Returns:
list of input_names and output_names from ONNX configuration
"""

# TODO : modify compression_option to quantization_config
outputs = []

if output_names is not None and len(output_names) != len(models_and_onnx_configs):
3 changes: 1 addition & 2 deletions optimum/intel/openvino/__init__.py
@@ -36,11 +36,10 @@

patch_torch_operators()

from .configuration import OVConfig
from .configuration import OVConfig, OVWeightQuantizationConfig
from .quantization import OVQuantizer
from .trainer import OVTrainer
from .training_args import OVTrainingArguments
from .weight_quantization import OVWeightQuantizationConfig

from .modeling import (
OVModelForAudioClassification,
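For reference, a minimal import sketch reflecting the re-export above (not part of the diff itself):

from optimum.intel.openvino import OVConfig, OVWeightQuantizationConfig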
116 changes: 113 additions & 3 deletions optimum/intel/openvino/configuration.py
@@ -12,15 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Optional, Union
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union

import torch
from transformers import PretrainedConfig
from transformers.utils.quantization_config import QuantizationConfigMixin

from optimum.configuration_utils import BaseConfig

from .weight_quantization import OVWeightQuantizationConfig


DEFAULT_QUANTIZATION_CONFIG = {
"algorithm": "quantization",
@@ -77,6 +77,28 @@
}


DEFAULT_4BIT_CONFIGS = {
"databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5},
"EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64},
"facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
"bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6},
"togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128},
"HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6},
"meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
"meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
"meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
"stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
"stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
"stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
"pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
"THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72},
"Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
"openlm-research/open_llama_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
"tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
"psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
}


class OVConfig(BaseConfig):
CONFIG_NAME = "openvino_config.json"
FULL_CONFIGURATION_FILE = "openvino_config.json"
Expand Down Expand Up @@ -127,3 +149,91 @@ def _enable_standard_onnx_export_option(self):
for i, algo_config in enumerate(self.compression):
if algo_config["algorithm"] == "quantization":
self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model


@dataclass
class OVWeightQuantizationConfig(QuantizationConfigMixin):
"""
This is a wrapper class for all the possible attributes and features that can be used to quantize a model that
has been loaded with the `optimum-intel` API, using NNCF.
Args:
bits (`int`, defaults to 8):
The number of bits to quantize to.
sym (`bool`, *optional*, defaults to `False`):
Whether to use symmetric quantization.
tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
The tokenizer used to process the dataset. You can pass either:
- A custom tokenizer object.
- A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
user or organization name, like `dbmdz/bert-base-german-cased`.
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
dataset (`str` or `List[str]`, *optional*):
The dataset used for data-aware compression. You can provide your own dataset as a list of strings, or use
one of ['wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'].
group_size (`int`, *optional*, defaults to 128):
The group size to use for quantization. The recommended value is 128; -1 uses per-column quantization.
ratio (`float`, *optional*, defaults to 1.0):
The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM
and the rest to INT8_ASYM).
all_layers (`bool`, *optional*):
Whether to compress all layers to 4-bit, rather than keeping some of them in 8-bit precision.
sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
The sensitivity metric for assigning quantization precision to layers. In order to
preserve the accuracy of the model, the more sensitive layers receive a higher precision.
awq (`bool`, *optional*):
Enables the AWQ method to unify weight ranges and improve overall model accuracy.
ignored_scope (`nncf.IgnoredScope`, *optional*):
An ignored scope that defines the list of model control flow graph nodes to be ignored during quantization.
"""

def __init__(
self,
bits: int = 8,
sym: bool = False,
tokenizer: Any = None,
dataset: Optional[str] = None,
ratio: Optional[float] = None,
group_size: Optional[int] = None,
all_layers: Optional[bool] = None,
sensitivity_metric: Optional[str] = None,
ignored_scope: Optional[dict] = None,
**kwargs,
):
self.bits = bits
self.sym = sym
self.tokenizer = tokenizer
self.dataset = dataset
self.group_size = group_size
self.ratio = ratio
self.all_layers = all_layers
self.sensitivity_metric = sensitivity_metric
self.ignored_scope = ignored_scope
self.quant_method = "default" # TODO : enable AWQ after nncf v2.9.0 release
self.post_init()

def post_init(self):
r"""
Safety checker that arguments are correct
"""
if self.ratio is not None and not (0 <= self.ratio <= 1):
raise ValueError("damp_percent must between 0 and 1.")
if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
raise ValueError("group_size must be greater than 0 or equal to -1")
if self.dataset is not None and isinstance(self.dataset, str):
if self.dataset not in ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]:
raise ValueError(
f"""You have entered a string value for dataset. You can only choose between
['wikitext2','c4','c4-new','ptb','ptb-new'], but we found {self.dataset}"""
)

if self.bits not in [4, 8]:
raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}")


def _check_default_4bit_configs(config: PretrainedConfig):
return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
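As a reading aid, here is a hedged sketch of how the new class and the default 4-bit table introduced above can be exercised; the model id and parameter values are illustrative assumptions, not part of the commit, and `_check_default_4bit_configs` is an internal helper:

from transformers import AutoConfig
from optimum.intel.openvino import OVWeightQuantizationConfig
from optimum.intel.openvino.configuration import _check_default_4bit_configs

# Build a 4-bit weight-only config; post_init() validates bits, ratio, group_size and dataset.
q_config = OVWeightQuantizationConfig(bits=4, sym=True, group_size=128, ratio=0.8)

# Look up the recommended defaults for a model id covered by DEFAULT_4BIT_CONFIGS.
config = AutoConfig.from_pretrained("EleutherAI/gpt-j-6b")
defaults = _check_default_4bit_configs(config)  # {"bits": 4, "sym": False, "group_size": 64} for this model id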
6 changes: 0 additions & 6 deletions optimum/intel/openvino/modeling_base.py
@@ -155,7 +155,6 @@ def _from_pretrained(
from_onnx: bool = False,
local_files_only: bool = False,
load_in_8bit: bool = False,
load_in_4bit: bool = False,
**kwargs,
):
"""
@@ -185,11 +184,7 @@
Whether or not to only look at local files (i.e., do not try to download the model).
load_in_8bit (`bool`, *optional*, defaults to `False`):
Whether or not to apply 8-bit weight quantization.
load_in_4bit (`bool`, *optional*, defaults to `False`):
Whether or not to apply 4-bit weight quantization.
"""
if load_in_4bit:
raise ValueError("load_in_4bit is available for OVModelForCausalLM only.")
model_path = Path(model_id)
default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME
file_name = file_name or default_file_name
@@ -257,7 +252,6 @@ def _from_transformers(
task: Optional[str] = None,
trust_remote_code: bool = False,
load_in_8bit: Optional[bool] = None,
load_in_4bit: Optional[bool] = None,
**kwargs,
):
"""
30 changes: 22 additions & 8 deletions optimum/intel/openvino/modeling_decoder.py
@@ -32,10 +32,11 @@

from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful
from ...exporters.openvino.stateful import model_has_state
from ..utils.import_utils import is_nncf_available
from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
from .configuration import OVWeightQuantizationConfig, _check_default_4bit_configs
from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE
from .weight_quantization import OVWeightQuantizationConfig, compress_decoder_weights


logger = logging.getLogger(__name__)
@@ -238,7 +239,6 @@ def _from_transformers(
use_cache: bool = True,
trust_remote_code: bool = False,
load_in_8bit: Optional[bool] = None,
load_in_4bit: Optional[bool] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
@@ -258,8 +258,9 @@

# If load_in_8bit is not specified then compression_option should be set to None and will be set by default in main_export depending on the model size
compression_option = None
if load_in_8bit is not None or load_in_4bit is not None:
if load_in_8bit is not None or quantization_config is not None:
compression_option = "fp32"

stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
main_export(
model_name_or_path=model_id,
@@ -285,7 +286,6 @@
use_cache=use_cache,
load_in_8bit=load_in_8bit,
stateful=None,
load_in_4bit=load_in_4bit,
quantization_config=quantization_config,
**kwargs,
)
@@ -556,7 +556,6 @@ def _from_pretrained(
from_onnx: bool = False,
local_files_only: bool = False,
load_in_8bit: bool = False,
load_in_4bit: bool = False,
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
**kwargs,
):
@@ -575,8 +574,10 @@
local_files_only=local_files_only,
)

if load_in_8bit and load_in_4bit:
raise ValueError("Either load_in_8bit or load_in_4bit should be set to True.")
if isinstance(quantization_config, dict):
quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)

load_in_4bit = quantization_config.bits == 4 if quantization_config else False
model = cls.load_model(model_cache_path, load_in_8bit=False if load_in_4bit else load_in_8bit)

model_type = config.model_type.replace("_", "-")
@@ -594,7 +595,20 @@
causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)

if load_in_4bit:
compress_decoder_weights(causal_model, quantization_config)
if not is_nncf_available():
raise ImportError(
"Quantization of the weights requires nncf, please install it with `pip install nncf`"
)
from .quantization import _weight_only_quantization

default_config = _check_default_4bit_configs(config)

if default_config:
logger.info(
f"For the given model, we recommend the following `quantization_config` : {default_config}"
)

_weight_only_quantization(causal_model, quantization_config)
return causal_model
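Below is a hedged sketch of the user-facing call that reaches the 4-bit branch above; the model id and parameter values are illustrative (taken from the DEFAULT_4BIT_CONFIGS table), and nncf must be installed for the weight-compression step:

from optimum.intel.openvino import OVModelForCausalLM, OVWeightQuantizationConfig

# Export to OpenVINO and apply 4-bit weight-only quantization via the new quantization_config argument.
model = OVModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, group_size=32, ratio=0.5),
)

# A plain dict is also accepted; _from_pretrained converts it with OVWeightQuantizationConfig.from_dict.
model = OVModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    export=True,
    quantization_config={"bits": 4, "sym": False, "group_size": 32},
)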

