Store tokenizer conversion params in rt_info / refactor passing params (openvinotoolkit#268)

* WIP

* refactor passing parameters as a single object, stored rt_info

* ready for review

* apply review comments

* apply ruff format

* fix what ruff has broken

* fix docstring

* fix tests

* add tests

* Update python/openvino_tokenizers/convert_tokenizer.py

Co-authored-by: Artur Paniukov <[email protected]>

* add barrier for specifying both params and individual conversion arguments

* simplify more

* fix tests 2

* move imports

* fix tests 3

---------

Co-authored-by: Artur Paniukov <[email protected]>
pavel-esir and apaniukov authored Oct 7, 2024
1 parent e74460f commit 298e1ee
Showing 10 changed files with 4,529 additions and 4,418 deletions.
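In practice, the refactor gives convert_tokenizer two equivalent calling styles: the familiar individual keyword arguments, or a single TokenzierConversionParams object (the class name is spelled this way in the codebase) that is also recorded in the converted model's rt_info. A usage sketch based on the diff below; the checkpoint name is only an example:

    from transformers import AutoTokenizer

    from openvino_tokenizers import convert_tokenizer
    from openvino_tokenizers.utils import TokenzierConversionParams

    hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint

    # Style 1: individual keyword arguments, as before.
    ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)

    # Style 2: a single params object; a plain dict works too and is converted internally.
    params = TokenzierConversionParams(with_detokenizer=True, skip_special_tokens=True)
    ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, params=params)

    # Mixing both styles raises ValueError (the barrier added in this PR):
    # convert_tokenizer(hf_tokenizer, params=params, with_detokenizer=True)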
20 changes: 15 additions & 5 deletions benchmark/benchmark.py
@@ -38,7 +38,9 @@ def batch_iter(dataset: Iterable, batch: int = 1):
         yield next_batch
 
 
-def benchmark_tokenizer_async(ov_tokenizer: CompiledModel, dataset: List[Tuple[str, str]], batch: int = 1) -> Tuple[pd.Series, float]:
+def benchmark_tokenizer_async(
+    ov_tokenizer: CompiledModel, dataset: List[Tuple[str, str]], batch: int = 1
+) -> Tuple[pd.Series, float]:
     def callback(
         ir: InferRequest,
         user_data: Tuple[List[int], float, int],
@@ -53,7 +55,9 @@ def callback(
     times = [0 for _ in range(iterations)]
 
     bench_start = perf_counter()
-    for idx, prompt in tqdm(enumerate(batch_iter(chain.from_iterable(dataset), batch)), total=iterations, desc="Async benchmark"):
+    for idx, prompt in tqdm(
+        enumerate(batch_iter(chain.from_iterable(dataset), batch)), total=iterations, desc="Async benchmark"
+    ):
         start = perf_counter()
         async_queue.start_async(prompt, (times, start, idx))
     async_queue.wait_all()
@@ -91,7 +95,9 @@ def benchmark_tokenizers(
     hf_tokenizer(["test " * repeat])
 
     ov_perf_counters = []
-    for prompt in tqdm(batch_iter(chain.from_iterable(dataset), batch), total=len(dataset) * 2 / batch, desc="Sync benchmark"):
+    for prompt in tqdm(
+        batch_iter(chain.from_iterable(dataset), batch), total=len(dataset) * 2 / batch, desc="Sync benchmark"
+    ):
         res = [prompt]
 
         ov_start = perf_counter()
@@ -136,7 +142,9 @@ def dump_latency_stats(results: pd.DataFrame, model_name: str) -> None:
     sorted_res.to_csv(f"latency_res_{model_name}.csv", index=False)
 
 
-def print_stats(results: pd.DataFrame, async_fps: Optional[float] = None, batch: int = 1) -> Tuple[float, float, float]:
+def print_stats(
+    results: pd.DataFrame, async_fps: Optional[float] = None, batch: int = 1
+) -> Tuple[float, float, float]:
     data_size = len(results) * batch
     ov_fps = data_size / results["OV"].sum()
     hf_fps = data_size / results["HF"].sum()
@@ -205,7 +213,9 @@ def main(
     result_df = benchmark_tokenizers(ov_tokenizer, hf_tokenizer, dataset, per_layer_stats, batch)
     async_results, async_fps = benchmark_tokenizer_async(ov_tokenizer, dataset, batch)
     result_df = result_df.assign(OV_ASYNC=async_results.values)
-    result_df["Prompt Length, chars"] = result_df["prompt"].apply(lambda prompts: sum(len(prompt) for prompt in prompts))
+    result_df["Prompt Length, chars"] = result_df["prompt"].apply(
+        lambda prompts: sum(len(prompt) for prompt in prompts)
+    )
 
     ov_fps, async_fps, hf_fps = print_stats(result_df, async_fps, batch)
     model_name = checkpoint.rsplit("/", 1)[-1]
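For context on the loops reformatted above, batch_iter groups a flat iterable into lists of batch prompts. Its body is outside these hunks; a minimal sketch consistent with the signature and the yield next_batch line in the first hunk (an assumption, not the verbatim implementation):

    from itertools import islice
    from typing import Iterable

    def batch_iter(dataset: Iterable, batch: int = 1):
        # Yield consecutive chunks of `batch` items; the last chunk may be shorter.
        it = iter(dataset)
        while next_batch := list(islice(it, batch)):
            yield next_batch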
1 change: 1 addition & 0 deletions python/openvino_tokenizers/constants.py
@@ -35,6 +35,7 @@
 MIN_CACHE_CAPACITY = 20_000
 VOCAB_SIZE_CACHE_PROPORTION = 0.2
 
+
 class UTF8ReplaceMode(Enum):
     IGNORE: str = "ignore"
     REPLACE: str = "replace"
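The two UTF8ReplaceMode values match the names of Python's UTF-8 decode error handlers, which suggests the intended behavior (the analogy is ours, not stated in this diff):

    malformed = b"abc\xff"
    print(malformed.decode("utf-8", errors="ignore"))   # 'abc' - invalid bytes are dropped
    print(malformed.decode("utf-8", errors="replace"))  # 'abc\ufffd' - invalid bytes become U+FFFD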
164 changes: 99 additions & 65 deletions python/openvino_tokenizers/convert_tokenizer.py
@@ -5,19 +5,60 @@
 import logging
 import sys
 from typing import Any, Optional, Tuple, Union
+from functools import wraps
 
 from openvino.runtime import Model, Type
 from openvino.runtime.exceptions import OVTypeError
 
 from openvino_tokenizers.constants import UTF8ReplaceMode
-from openvino_tokenizers.utils import change_inputs_type, change_outputs_type, update_rt_info
+from openvino_tokenizers.utils import (
+    change_inputs_type,
+    change_outputs_type,
+    update_rt_info,
+    TokenzierConversionParams,
+)
 
 logger = logging.getLogger(__name__)
 
 
+def capture_arg(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        params = None
+        if len(args) > 1 and args[1] is not None:
+            params = args[1]
+        if "params" in kwargs:
+            params = kwargs["params"]
+
+        if params is not None:
+            for key in TokenzierConversionParams.__match_args__:
+                if kwargs.get(key) is not None:
+                    msg = (
+                        "Cannot specify both 'params' and individual convert_tokenizer arguments simultaneously. "
+                        "Please pass all conversion params either individually, e.g. "
+                        "convert_tokenizer(tokenizer_object, with_detokenizer=True, add_special_tokens=True, ...), "
+                        "or within the 'params' argument, e.g. "
+                        "convert_tokenizer(tokenizer_object, params={'with_detokenizer': True, 'add_special_tokens': True, ...})"
+                    )
+                    raise ValueError(msg)
+
+        if isinstance(params, dict):
+            params = TokenzierConversionParams(**params)
+        if params is None:
+            params = TokenzierConversionParams(**kwargs)
+        return func(args[0], params)
+
+    # Embed the TokenzierConversionParams docstring into the convert_tokenizer docstring.
+    wrapper.__doc__ = func.__doc__.replace("Returns:", "Returns:\n" + TokenzierConversionParams.__doc__ + "\n")
+    return wrapper
+
+
+@capture_arg
 def convert_tokenizer(
     tokenizer_object: Any,
+    params: Union[TokenzierConversionParams, dict] = None,
+    *,
     with_detokenizer: bool = False,
     add_special_tokens: bool = True,
     skip_special_tokens: bool = True,
@@ -30,80 +71,73 @@ def convert_tokenizer(
     use_sentencepiece_backend: bool = False,
     utf8_replace_mode: Optional[UTF8ReplaceMode] = None,
 ) -> Union[Model, Tuple[Model, Model]]:
-    ov_tokenizers = None
-
-    if "transformers" in sys.modules:
-        from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
-
-        from .hf_parser import (
-            convert_fast_tokenizer,
-            convert_sentencepiece_model_tokenizer,
-            convert_tiktoken_model_tokenizer,
-            is_sentencepiece_bpe_model,
-            is_sentencepiece_model,
-            is_tiktoken_model,
-        )
-
-        can_use_sentencepiece = is_sentencepiece_model(tokenizer_object)
-        is_unigram = can_use_sentencepiece and not is_sentencepiece_bpe_model(tokenizer_object)
-        if isinstance(tokenizer_object, PreTrainedTokenizerBase):
-            if can_use_sentencepiece and (is_unigram or not tokenizer_object.is_fast or use_sentencepiece_backend):
-                logger.info("Convert tokenizer using SentencePiece .model file.")
-                ov_tokenizers = convert_sentencepiece_model_tokenizer(
-                    tokenizer_object,
-                    add_attention_mask=True,
-                    with_detokenizer=with_detokenizer,
-                    streaming_detokenizer=streaming_detokenizer,
-                    add_special_tokens=add_special_tokens,
-                    skip_special_tokens=skip_special_tokens,
-                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                    handle_special_tokens_with_re=handle_special_tokens_with_re,
-                    utf8_replace_mode=utf8_replace_mode,
-                )
-            elif is_tiktoken_model(tokenizer_object):
-                logger.info("Convert tiktoken-based tokenizer")
-                ov_tokenizers = convert_tiktoken_model_tokenizer(
-                    tokenizer_object,
-                    with_detokenizer=with_detokenizer,
-                    add_special_tokens=add_special_tokens,
-                    skip_special_tokens=skip_special_tokens,
-                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                    use_max_padding=use_max_padding,
-                    utf8_replace_mode=utf8_replace_mode,
-                )
-            elif isinstance(tokenizer_object, PreTrainedTokenizerFast):
-                logger.info("Convert Huggingface Fast tokenizer pipeline.")
-                ov_tokenizers = convert_fast_tokenizer(
-                    tokenizer_object,
-                    number_of_inputs=1,
-                    with_detokenizer=with_detokenizer,
-                    add_special_tokens=add_special_tokens,
-                    skip_special_tokens=skip_special_tokens,
-                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                    use_max_padding=use_max_padding,
-                    utf8_replace_mode=utf8_replace_mode,
-                )
-            else:
-                raise OVTypeError(f"Huggingface tokenizer type is not supported: {type(tokenizer_object)}")
-
-            if isinstance(ov_tokenizers, tuple):
-                for ov_model in ov_tokenizers:
-                    update_rt_info(ov_model, tokenizer_object)
-            else:
-                update_rt_info(ov_tokenizers, tokenizer_object)
-    else:
+    """
+    Converts a given tokenizer object into an OpenVINO-compatible model.
+
+    If no `params` is provided, the function will construct a `TokenzierConversionParams` instance
+    using the passed keyword arguments to control the behavior of the conversion. Either `params`
+    or keyword arguments should be passed; if both are specified, an error is raised.
+
+    Parameters:
+    -----------
+    tokenizer_object : Any
+        The tokenizer object to convert. This should be an instance of a supported tokenizer, such
+        as Huggingface's `PreTrainedTokenizerBase` or `PreTrainedTokenizerFast`.
+
+    params : TokenzierConversionParams, optional
+        If provided, the `TokenzierConversionParams` object containing conversion parameters.
+        If not provided, the parameters will be constructed from the other keyword arguments.
+
+    Returns:
+    --------
+    Union[Model, Tuple[Model, Model]]
+        The converted tokenizer model, or a tuple of tokenizer and detokenizer models, depending
+        on the `with_detokenizer` value.
+    """
+    ov_tokenizers = None
+
+    if "transformers" not in sys.modules:
         raise EnvironmentError(
             "No transformers library in the environment. Install required dependencies with one of two options:\n"
             "1. pip install openvino-tokenizers[transformers]\n"
             "2. pip install transformers[sentencepiece] tiktoken\n"
         )
+
+    from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
+
+    from .hf_parser import (
+        convert_fast_tokenizer,
+        convert_sentencepiece_model_tokenizer,
+        convert_tiktoken_model_tokenizer,
+        is_sentencepiece_bpe_model,
+        is_sentencepiece_model,
+        is_tiktoken_model,
+    )
+
+    can_use_sentencepiece = is_sentencepiece_model(tokenizer_object)
+    is_unigram = can_use_sentencepiece and not is_sentencepiece_bpe_model(tokenizer_object)
+    if isinstance(tokenizer_object, PreTrainedTokenizerBase):
+        if can_use_sentencepiece and (is_unigram or not tokenizer_object.is_fast or params.use_sentencepiece_backend):
+            logger.info("Convert tokenizer using SentencePiece .model file.")
+            ov_tokenizers = convert_sentencepiece_model_tokenizer(tokenizer_object, params)
+        elif is_tiktoken_model(tokenizer_object):
+            logger.info("Convert tiktoken-based tokenizer")
+            ov_tokenizers = convert_tiktoken_model_tokenizer(tokenizer_object, params)
+        elif isinstance(tokenizer_object, PreTrainedTokenizerFast):
+            logger.info("Convert Huggingface Fast tokenizer pipeline.")
+            ov_tokenizers = convert_fast_tokenizer(tokenizer_object, params)
+        else:
+            raise OVTypeError(f"Huggingface tokenizer type is not supported: {type(tokenizer_object)}")
+
+    if isinstance(ov_tokenizers, tuple):
+        for ov_model in ov_tokenizers:
+            update_rt_info(ov_model, tokenizer_object, params)
+    else:
+        update_rt_info(ov_tokenizers, tokenizer_object, params)
 
     if ov_tokenizers is None:
         raise OVTypeError(f"Tokenizer type is not supported: {type(tokenizer_object)}")
 
     if isinstance(ov_tokenizers, tuple):
         return (
-            change_outputs_type(ov_tokenizers[0], tokenizer_output_type),
-            change_inputs_type(ov_tokenizers[1], detokenizer_input_type),
+            change_outputs_type(ov_tokenizers[0], params.tokenizer_output_type),
+            change_inputs_type(ov_tokenizers[1], params.detokenizer_input_type),
         )
-    return change_outputs_type(ov_tokenizers, tokenizer_output_type)
+    return change_outputs_type(ov_tokenizers, params.tokenizer_output_type)
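Because update_rt_info now receives params, the conversion settings travel with the serialized model and can be inspected after the fact. A sketch of reading them back, assuming each parameter is stored as a top-level rt_info entry; the exact key layout lives in utils.py, which is not shown in this diff, so the path below is hypothetical:

    import openvino as ov

    core = ov.Core()
    ov_tokenizer = core.read_model("openvino_tokenizer.xml")  # a previously converted tokenizer

    # Hypothetical rt_info key; check utils.update_rt_info for the real layout.
    if ov_tokenizer.has_rt_info(["add_special_tokens"]):
        print(ov_tokenizer.get_rt_info(["add_special_tokens"]))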