Skip to content

Commit

Permalink
Del Unused Regex
Browse files Browse the repository at this point in the history
  • Loading branch information
apaniukov committed Nov 11, 2024
1 parent 9554fe5 commit 7055c35
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
7 changes: 0 additions & 7 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,13 +231,6 @@ def del_control_chars_regex(cls) -> "RegexNormalizationStep":
replace_term="",
)

# Alternate constructor: returns a step built via cls(...) with a fixed search
# pattern and the replacement "$1" (i.e. each match is replaced by its capture).
# The pattern's alternatives are:
#   - a space followed by one of . ? ! ,   (the punctuation mark is captured)
#   - a space followed by 'm or 's         (the apostrophe fragment is captured)
#   - a lone ' with a space on each side   (only the apostrophe is captured, so
#                                           both surrounding spaces are dropped)
#   - a space followed by 're or 've       (the apostrophe fragment is captured)
# NOTE(review): `(?|...)` looks like PCRE-style branch-reset syntax, which would
# make the capture in every alternative group number 1 so that "$1" always refers
# to whichever alternative matched - confirm the regex engine behind this step
# supports it.
@classmethod
def clean_up_tokenization_spaces(cls) -> "RegexNormalizationStep":
return cls(
regex_search_pattern=r"(?| ([\.\?\!\,])| ('[ms])| (') | ('[rv]e))",
replace_term="$1",
)

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(
(
Expand Down
8 changes: 8 additions & 0 deletions tests/layer_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,14 @@ def test_charsmap_normalizartion(test_string, hf_charsmap_tokenizer, precompiled
(" Hello world!", " Hello world!", RegexNormalizationStep.add_prefix_whitespace_to_not_whitespace_regex()),
("\tHello world!", " \tHello world!", RegexNormalizationStep.add_prefix_whitespace_to_not_whitespace_regex()),
("\tHello", "▁\tHello", RegexNormalizationStep.prepend_regex("▁")),
( # test backward compatibility with old regex
" ' declare",
"'declare",
RegexNormalizationStep(
regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)",
replace_term=r"\1",
)
),
]
)
def test_regex_normalization(test_string, expected, layer):
Expand Down

0 comments on commit 7055c35

Please sign in to comment.