Switch To BPE Backend (openvinotoolkit#235)
* Switch default backend for Sentencepiece BPE models
* Fix BPE added tokens support
* Refactor skips for SP tokenizers
apaniukov authored Aug 30, 2024
1 parent f9d8b7e commit 45ddc7f
Showing 10 changed files with 14,045 additions and 7,427 deletions.
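
For orientation, a minimal sketch of what this switch means for users of the Python API. The model id is illustrative; `convert_tokenizer` and `with_detokenizer` are the package's existing API, and `use_sentencepiece_backend` is the argument added in this commit.

from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer

# Illustrative SentencePiece BPE model; any similar tokenizer behaves the same way.
hf_tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

# After this commit, fast SentencePiece BPE tokenizers are converted with the BPE backend by default.
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)

# The SentencePiece .model backend can still be requested explicitly.
ov_tokenizer_sp, ov_detokenizer_sp = convert_tokenizer(
    hf_tokenizer, with_detokenizer=True, use_sentencepiece_backend=True
)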
108 changes: 60 additions & 48 deletions README.md
@@ -433,8 +433,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >86.56</td>
<td >5698</td>
<td >88.23</td>
<td >6534</td>
</tr>
<tr>
<td >Tiktoken</td>
@@ -608,39 +608,33 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >NousResearch/Llama-2-13b-hf</td>
<td >94.98</td>
<td >96.65</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >NousResearch/Llama-2-13b-hf_slow</td>
<td >NousResearch/Llama-2-13b-hf_legacy</td>
<td >100.00</td>
<td >223</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >THUDM/chatglm2-6b</td>
<td >NousResearch/Llama-2-13b-hf_sp_backend</td>
<td >100.00</td>
<td >153</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >THUDM/chatglm2-6b_slow</td>
<td >THUDM/chatglm2-6b_legacy</td>
<td >100.00</td>
<td >149</td>
<td >153</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >THUDM/chatglm3-6b</td>
<td >THUDM/chatglm3-6b_legacy</td>
<td >50.97</td>
<td >155</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >THUDM/chatglm3-6b_slow</td>
<td >49.67</td>
<td >151</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >camembert-base</td>
@@ -649,21 +643,27 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >camembert-base_slow</td>
<td >78.92</td>
<td >223</td>
<td >camembert-base_legacy</td>
<td >76.15</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >codellama/CodeLlama-7b-hf</td>
<td >100.00</td>
<td >96.65</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >codellama/CodeLlama-7b-hf_slow</td>
<td >100.00</td>
<td >223</td>
<td >codellama/CodeLlama-7b-hf_legacy</td>
<td >96.65</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >codellama/CodeLlama-7b-hf_sp_backend</td>
<td >94.98</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
@@ -673,21 +673,27 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >facebook/musicgen-small_slow</td>
<td >78.92</td>
<td >223</td>
<td >facebook/musicgen-small_legacy</td>
<td >79.92</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct</td>
<td >99.17</td>
<td >95.85</td>
<td >241</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct_legacy</td>
<td >95.85</td>
<td >241</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct_slow</td>
<td >99.11</td>
<td >225</td>
<td >microsoft/Phi-3-mini-128k-instruct_sp_backend</td>
<td >94.19</td>
<td >241</td>
</tr>
<tr>
<td >SentencePiece</td>
@@ -697,21 +703,27 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/deberta-v3-base_slow</td>
<td >microsoft/deberta-v3-base_legacy</td>
<td >100.00</td>
<td >223</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >mlx-community/quantized-gemma-7b-it</td>
<td >96.68</td>
<td >99.17</td>
<td >241</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >mlx-community/quantized-gemma-7b-it_legacy</td>
<td >99.17</td>
<td >241</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >mlx-community/quantized-gemma-7b-it_slow</td>
<td >98.22</td>
<td >225</td>
<td >mlx-community/quantized-gemma-7b-it_sp_backend</td>
<td >100.00</td>
<td >241</td>
</tr>
<tr>
<td >SentencePiece</td>
@@ -721,9 +733,9 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >rinna/bilingual-gpt-neox-4b_slow</td>
<td >89.24</td>
<td >223</td>
<td >rinna/bilingual-gpt-neox-4b_legacy</td>
<td >86.61</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
@@ -733,9 +745,9 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >t5-base_slow</td>
<td >79.82</td>
<td >223</td>
<td >t5-base_legacy</td>
<td >81.17</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
@@ -745,9 +757,9 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >xlm-roberta-base_slow</td>
<td >97.76</td>
<td >223</td>
<td >xlm-roberta-base_legacy</td>
<td >96.23</td>
<td >239</td>
</tr>
<tr>
<td >SentencePiece</td>
@@ -757,9 +769,9 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >xlnet-base-cased_slow</td>
<td >60.99</td>
<td >223</td>
<td >xlnet-base-cased_legacy</td>
<td >59.41</td>
<td >239</td>
</tr>
<tr>
<td >Tiktoken</td>
16 changes: 14 additions & 2 deletions python/openvino_tokenizers/cli.py
@@ -155,14 +155,25 @@ def get_parser() -> ArgumentParser:
"difference between original and OpenVINO tokenizers."
),
)
parser.add_argument(
"--use-sentencepiece-backend",
"--use_sentencepiece_backend",
required=False,
action="store_false",
help=(
"Use Sentencepiece library as a backend for tokenizer operation. "
"The repository should contain Sentencepiece `.model` file. "
"Unigram models supported by Sentencepiece backend only."
),
)
parser.add_argument(
"--handle-special-tokens-with-re",
"--handle_special_tokens_with_re",
required=False,
action="store_true",
help=(
"Use separete regex to handle special tokens for sentencepiece-based tokenizers. Use this option if the "
"converted tokenizer doesn't use special tokens during tokenization."
"Use a regex to handle special tokens for tokenizers with Sentencepiece backed. "
"Use this option if the converted tokenizer doesn't recognize special tokens during tokenization."
),
)
parser.add_argument(
@@ -243,6 +254,7 @@ def convert_hf_tokenizer() -> None:
        streaming_detokenizer=args.streaming_detokenizer,
        use_max_padding=args.max_padding is not None,
        handle_special_tokens_with_re=args.handle_special_tokens_with_re,
        use_sentencepiece_backend=args.use_sentencepiece_backend,
    )
    if not isinstance(converted, tuple):
        converted = (converted,)
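
To illustrate how the new CLI flag reaches the conversion call shown above, a small sketch using the parser from `cli.py`. The model id is illustrative, and it is assumed here that the model id is the only required positional argument.

from openvino_tokenizers.cli import get_parser

# Hypothetical invocation: opt into the SentencePiece backend and regex-based special token handling.
args = get_parser().parse_args(
    ["codellama/CodeLlama-7b-hf", "--use-sentencepiece-backend", "--handle-special-tokens-with-re"]
)

# argparse derives destinations from the first long option string, so both flags
# land on snake_case attributes that convert_hf_tokenizer forwards to convert_tokenizer.
assert args.use_sentencepiece_backend
assert args.handle_special_tokens_with_re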
6 changes: 5 additions & 1 deletion python/openvino_tokenizers/convert_tokenizer.py
@@ -26,6 +26,7 @@ def convert_tokenizer(
    streaming_detokenizer: bool = False,
    use_max_padding: bool = False,
    handle_special_tokens_with_re: Optional[bool] = None,
    use_sentencepiece_backend: bool = False,
) -> Union[Model, Tuple[Model, Model]]:
    ov_tokenizers = None

@@ -36,12 +37,15 @@
            convert_fast_tokenizer,
            convert_sentencepiece_model_tokenizer,
            convert_tiktoken_model_tokenizer,
            is_sentencepiece_bpe_model,
            is_sentencepiece_model,
            is_tiktoken_model,
        )

        can_use_sentencepiece = is_sentencepiece_model(tokenizer_object)
        is_unigram = can_use_sentencepiece and not is_sentencepiece_bpe_model(tokenizer_object)
        if isinstance(tokenizer_object, PreTrainedTokenizerBase):
            if is_sentencepiece_model(tokenizer_object):
            if can_use_sentencepiece and (is_unigram or not tokenizer_object.is_fast or use_sentencepiece_backend):
                logger.info("Convert tokenizer using SentencePiece .model file.")
                ov_tokenizers = convert_sentencepiece_model_tokenizer(
                    tokenizer_object,
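
To make the new dispatch rule concrete, here is a minimal restatement of the condition from the hunk above as a standalone predicate. The wrapper function is illustrative; the operand names mirror the diff.

def uses_sentencepiece_model_file(
    can_use_sentencepiece: bool, is_unigram: bool, is_fast: bool, use_sentencepiece_backend: bool
) -> bool:
    # Unigram models, slow (non-fast) tokenizers, and explicit opt-ins keep the SentencePiece
    # .model path; fast SentencePiece BPE models now fall through to the BPE converter by default.
    return can_use_sentencepiece and (is_unigram or not is_fast or use_sentencepiece_backend)

# A fast SentencePiece BPE tokenizer no longer takes the .model path by default...
assert not uses_sentencepiece_model_file(True, False, True, False)
# ...unless the new use_sentencepiece_backend argument is set.
assert uses_sentencepiece_model_file(True, False, True, True)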
44 changes: 43 additions & 1 deletion python/openvino_tokenizers/hf_parser.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import functools
import json
import sys
import tempfile
@@ -127,6 +127,27 @@ def parse_byte_level_pretokenization_step(
    return steps


def parse_metaspace(pretokenizer_dict: Dict[str, Any]) -> List[Union[NormalizationStep, PreTokenizatinStep]]:
    steps = []

    # old prefix adder
    if pretokenizer_dict.get("add_prefix_space"):
        steps.append(RegexNormalizationStep.add_prefix_whitespace_regex())

    replacement = pretokenizer_dict.get("replacement", "▁")
    steps.append(RegexNormalizationStep.replace_spaces_metaspace(replacement))

    # new prefix adder
    prepend_scheme = pretokenizer_dict.get("prepend_scheme", "never")
    if prepend_scheme != "never":
        steps.append(RegexNormalizationStep.prepend_with_check_regex(replacement, replacement))

    if pretokenizer_dict.get("split", False):
        steps.append(RegexSplitStep.metaspace_splitter(replacement))

    return steps


class TransformersTokenizerPipelineParser:
    def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1, add_special_tokens: bool = True) -> None:
        if not tokenizer_object.is_fast:
@@ -191,6 +212,24 @@ def parse_normalizer_step(self, step_dict: Dict[str, Any]) -> None:
        except KeyError:
            raise OVTypeError(f"Normalizer type '{step_dict['type']}' is not supported")

    @staticmethod
    def check_metaspace_normalizer(normalizer_dict: Dict[str, Any]) -> bool:
        if normalizer_dict.get("type") == "Sequence":
            normalizers = normalizer_dict["normalizers"]

            if len(normalizers) != 2:
                return False
            first, second = normalizers
            first_prerend = bool(first.get("type") == "Prepend" and first.get("prepend") == "▁")
            second_replace = bool(
                second.get("type") == "Replace"
                and second.get("pattern", {}).get("String") == " "
                and second.get("content") == "▁"
            )
            return first_prerend and second_replace

        return False

    def normalization(self) -> None:
        if self.tokenizer_json["normalizer"] is None:
            return
@@ -214,6 +253,7 @@ def normalization(self) -> None:
"Digits": lambda step_dict: RegexSplitStep.digits_splitter(
"isolate" if step_dict["individual_digits"] else "contiguous"
),
"Metaspace": parse_metaspace,
}

    def parse_pre_tokenization_step(self, step_dict: Dict[str, Any]) -> None:
@@ -448,6 +488,7 @@ def convert_fast_tokenizer(
    return tokenizer_model


@functools.lru_cache(1)
def is_sentencepiece_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
    with tempfile.TemporaryDirectory() as tmp:
        try:
@@ -479,6 +520,7 @@ def is_sentencepiece_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
    return False


@functools.lru_cache(1)
def is_sentencepiece_bpe_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
    with tempfile.TemporaryDirectory() as tmp:
        hf_tokenizer.save_pretrained(tmp)
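
As an illustration of the inputs the new helpers above consume, a short sketch with hand-written tokenizer.json fragments. The dictionaries follow the HuggingFace tokenizers serialization format that the parser reads; the values are examples, not taken from a specific model.

from openvino_tokenizers.hf_parser import TransformersTokenizerPipelineParser, parse_metaspace

# A Metaspace pre-tokenizer entry: replace spaces with "▁", prepend "▁", and split on it.
metaspace = {"type": "Metaspace", "replacement": "▁", "prepend_scheme": "always", "split": True}
steps = parse_metaspace(metaspace)  # replace-spaces, prepend, and metaspace-split steps

# The Prepend + Replace normalizer pair that check_metaspace_normalizer recognizes.
normalizer = {
    "type": "Sequence",
    "normalizers": [
        {"type": "Prepend", "prepend": "▁"},
        {"type": "Replace", "pattern": {"String": " "}, "content": "▁"},
    ],
}
assert TransformersTokenizerPipelineParser.check_metaspace_normalizer(normalizer)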