Skip to content

Commit

Permalink
As the slow->fast tokenizer conversion failed due to the different initialization of [M…
Browse files Browse the repository at this point in the history
…ASK] for slow and fast, the initialization of the [MASK] token needs to be made uniform between the fast and slow tokenizers
  • Loading branch information
Kokane committed Nov 28, 2023
1 parent 53a02c5 commit 913c5a1
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
5 changes: 4 additions & 1 deletion src/transformers/models/rembert/tokenization_rembert.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


Expand Down Expand Up @@ -111,6 +111,9 @@ def __init__(
mask_token="[MASK]",
**kwargs,
):
# Mask token behaves like a normal word, i.e. includes the space before it
mask_token = AddedToken("[MASK]", lstrip=True, rstrip=False, normalized=False)

self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
Expand Down
3 changes: 0 additions & 3 deletions tests/models/rembert/test_tokenization_rembert.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,3 @@ def test_sequence_builders(self):
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
tokenizer.sep_token_id
]

def test_added_tokens_serialization(self):
pass

0 comments on commit 913c5a1

Please sign in to comment.