From dbbdf3f1f669841002bc87ddcc713bbc3874804a Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Fri, 29 Sep 2023 16:21:30 +0100
Subject: [PATCH] final fix

---
 src/transformers/models/wav2vec2/tokenization_wav2vec2.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index f4715315a705a4..04e5c3fa59b4c1 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -206,9 +206,7 @@ def __init__(
 
         # make sure that tokens made of several
         # characters are not split at tokenization
-        for token in self.encoder.keys():
-            if len(token) > 1:
-                self.add_tokens(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
+        self.add_tokens([token for token in self.encoder.keys() if len(token) > 1])
 
     def set_target_lang(self, target_lang: str):
         """
@@ -227,9 +225,7 @@ def set_target_lang(self, target_lang: str):
 
         # make sure that tokens made of several
         # characters are not split at tokenization
-        for token in self.encoder.keys():
-            if len(token) > 1:
-                self.add_tokens(token)
+        self.add_tokens([token for token in self.encoder.keys() if len(token) > 1])
 
     @property
     def word_delimiter_token(self) -> str: