From 913c5a1a7db7eeedb1abb315a40647b352c1e743 Mon Sep 17 00:00:00 2001
From: Kokane
Date: Tue, 28 Nov 2023 20:44:19 +0530
Subject: [PATCH] Make [MASK] token initialization uniform between the slow
 and fast RemBERT tokenizers

Slow->fast tokenizer conversion failed because the [MASK] token was
initialized differently in the slow and fast tokenizers. Initialize
[MASK] as an AddedToken in the slow tokenizer as well, so both
tokenizers share the same configuration.
---
 src/transformers/models/rembert/tokenization_rembert.py | 5 ++++-
 tests/models/rembert/test_tokenization_rembert.py       | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py
index c1f12527ef5974..d1bdc572cedc8c 100644
--- a/src/transformers/models/rembert/tokenization_rembert.py
+++ b/src/transformers/models/rembert/tokenization_rembert.py
@@ -21,7 +21,7 @@
 
 import sentencepiece as spm
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
 
@@ -111,6 +111,9 @@ def __init__(
         mask_token="[MASK]",
         **kwargs,
     ):
+        # The mask token behaves like a normal word, i.e. it includes the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
         self.keep_accents = keep_accents
diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py
index 6038591e82f464..41e2b2fc27a95b 100644
--- a/tests/models/rembert/test_tokenization_rembert.py
+++ b/tests/models/rembert/test_tokenization_rembert.py
@@ -160,6 +160,3 @@ def test_sequence_builders(self):
         assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
             tokenizer.sep_token_id
         ]
-
-    def test_added_tokens_serialization(self):
-        pass
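
For context, a minimal sketch of the behavior this patch aligns. It assumes
the "google/rembert" checkpoint is reachable and a transformers build that
includes this change; the sample sentence is illustrative, not from the test
suite:

    # Not part of the patch: illustrates the slow->fast conversion this fixes.
    from transformers import AutoTokenizer

    # Load the slow (SentencePiece) tokenizer.
    slow = AutoTokenizer.from_pretrained("google/rembert", use_fast=False)
    # from_slow=True rebuilds the fast tokenizer from the slow one, i.e. the
    # conversion path that previously failed.
    fast = AutoTokenizer.from_pretrained("google/rembert", use_fast=True, from_slow=True)

    # With the uniform AddedToken initialization, [MASK] should carry the
    # same lstrip/rstrip/normalized flags on both sides, which is what the
    # common test_added_tokens_serialization test checks.
    mask_id = slow.mask_token_id
    print(slow.added_tokens_decoder[mask_id])
    print(fast.added_tokens_decoder[mask_id])

    # lstrip=True lets [MASK] absorb the preceding space during tokenization.
    print(slow.tokenize("Paris is the [MASK] of France."))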