
Commit

Update multilingual_tokenizer.py
ryback123 authored Sep 26, 2024
1 parent bca3360 commit 89d12f3
Showing 1 changed file with 4 additions and 9 deletions.
13 changes: 4 additions & 9 deletions nemo/collections/common/tokenizers/multilingual_tokenizer.py
@@ -117,10 +117,10 @@ def ids_to_text(self, ids, lang):
             ids = ids.tolist()
 
         tokens = []
+        tokenizer = self.tokenizers_dict[lang]
         for id in ids:
             # offset_id = self.offset_token_ids_by_token_id[id]
             # tokenizer = self.tokenizers_by_token_id[id]
-            tokenizer = self.tokenizers_dict[lang]
             # tokens.extend(tokenizer.ids_to_tokens([offset_id]))
             tokens.extend(tokenizer.ids_to_tokens([id]))
         text = ''.join(tokens).replace('▁', ' ')
@@ -131,14 +131,9 @@ def token_to_id(self, token, lang_id):
         tokenizer = self.tokenizers_dict[lang_id]
         return tokenizer.token_to_id(token) + self.token_id_offset[lang_id]
 
-    def ids_to_tokens(self, ids):
-        tokens = []
-
-        for id in ids:
-            offset_id = self.offset_token_ids_by_token_id[id]
-            tokenizer = self.tokenizers_by_token_id[id]
-            token = tokenizer.ids_to_tokens([offset_id])[0]
-            tokens.append(token)
+    def ids_to_tokens(self, ids, lang_id):
+        tokenizer = self.tokenizers_dict[lang_id]
+        tokens = [tokenizer.ids_to_tokens([id])[0] for id in ids]
 
         return tokens
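
The commit replaces the per-token-id tokenizer resolution (via offset_token_ids_by_token_id and tokenizers_by_token_id) with an explicit language argument: the tokenizer is looked up once from tokenizers_dict and applied directly to the given ids, which suggests the ids are expected to be plain per-language ids rather than globally offset ones. Below is a minimal sketch of the resulting call pattern, assuming a tokenizers_dict keyed by language code; the class name, example data, and usage line are illustrative, not part of the file.

    # Illustrative sketch only; the real class lives in
    # nemo/collections/common/tokenizers/multilingual_tokenizer.py.
    class MultilingualTokenizerSketch:
        def __init__(self, tokenizers_dict):
            # e.g. {"en": english_tokenizer, "de": german_tokenizer},
            # each exposing ids_to_tokens(list_of_ids) -> list_of_tokens
            self.tokenizers_dict = tokenizers_dict

        def ids_to_tokens(self, ids, lang_id):
            # After this change the caller names the language explicitly, so the
            # tokenizer is resolved once, not once per token id via offset tables.
            tokenizer = self.tokenizers_dict[lang_id]
            return [tokenizer.ids_to_tokens([id])[0] for id in ids]

    # Hypothetical usage with per-language (un-offset) ids:
    # tokens = multi_tokenizer.ids_to_tokens([5, 17, 42], lang_id="en")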

