Skip to content

Commit

Permalink
don't check isinstance during iterations
Browse files (browse the repository at this point in the history)
  • Loading branch information
mshannon-sil committed Apr 10, 2024
1 parent 38cdb25 commit 7942dfe
Showing 1 changed file with 2 additions and 1 deletion.
Original file line number | Diff line number | Diff line change
Expand Up @@ -172,10 +172,11 @@ def train(
def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes: List[str]) -> List[str]:
vocab = tokenizer.get_vocab().keys()
charset = set()
mpn_normalize = True if isinstance(tokenizer, (NllbTokenizerFast)) else False
for ex in train_dataset["translation"]:
for lang_code in lang_codes:
ex_text = ex[lang_code]
if isinstance(tokenizer, (NllbTokenizerFast)):
if mpn_normalize:
ex_text = self._mpn.normalize(ex_text)
ex_text = tokenizer.backend_tokenizer.normalizer.normalize_str(ex_text)
charset = charset | set(ex_text)
Expand Down

0 comments on commit 7942dfe

Please sign in to comment.