From b9c6bea75e4c1301f128fab47eb7494f3d2218c2 Mon Sep 17 00:00:00 2001
From: devfon <31345506+Huffon@users.noreply.github.com>
Date: Wed, 13 Jan 2021 06:07:59 +0900
Subject: [PATCH] Add `fuse_unk` option to SentencePieceBPETokenizer (#574)

* Add fuse_unk option to SentencePieceBPETokenizer

* Fix style

Co-authored-by: Anthony MOI
---
 .../py_src/tokenizers/implementations/sentencepiece_bpe.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py
index f9e424964..1a64213cc 100644
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py
@@ -20,9 +20,12 @@ def __init__(
         replacement: str = "▁",
         add_prefix_space: bool = True,
         dropout: Optional[float] = None,
+        fuse_unk: Optional[bool] = False,
     ):
         if vocab is not None and merges is not None:
-            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
+            tokenizer = Tokenizer(
+                BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)
+            )
         else:
             tokenizer = Tokenizer(BPE())
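A minimal usage sketch of the new option, assuming the patch above is applied. The toy `vocab` and `merges` below are illustrative, not taken from the patch; with `fuse_unk=True`, the underlying BPE model collapses a run of consecutive unknown pieces into a single `unk_token` instead of emitting one `<unk>` per unmatched piece.

```python
from tokenizers import SentencePieceBPETokenizer

# Toy vocabulary and merges, purely illustrative.
vocab = {"<unk>": 0, "▁": 1, "h": 2, "i": 3, "hi": 4, "▁hi": 5}
merges = [("h", "i"), ("▁", "hi")]

tokenizer = SentencePieceBPETokenizer(
    vocab=vocab,
    merges=merges,
    unk_token="<unk>",
    fuse_unk=True,  # new option added by this patch
)

# Characters not covered by the vocabulary map to <unk>; with fuse_unk=True,
# adjacent unknown pieces are fused into a single <unk> token.
print(tokenizer.encode("hi ??").tokens)
```

Without the option, the wrapper previously constructed `BPE(...)` with `fuse_unk` left at its default, so runs of unknown characters produced repeated `<unk>` tokens; the patch simply forwards the flag to the underlying model.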