Skip to content

Commit

Permalink
update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
sanchit-gandhi committed Sep 28, 2023
1 parent d29369d commit 0baebe7
Showing 1 changed file with 3 additions and 5 deletions.
8 changes: 3 additions & 5 deletions tests/models/whisper/test_tokenization_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,26 +278,24 @@ def test_timestamp_encoding(self):
rust_tokenizer = self.get_rust_tokenizer()

input_text = "<|0.00|> Whisper can do timestamps?<|2.60|>"
- target_text = "<|startoftranscript|><|notimestamps|><|0.00|> Whisper can do timestamps?<|2.60|><|endoftext|>"

# fmt: off
EXPECTED_TOKENS = [
- START_OF_TRANSCRIPT, NOTIMESTAMPS, NOTIMESTAMPS + 1, 2471, 271, 610,
- 393, 360, 220, 31208, 377, 23150, 30, 50494, END_OF_TRANSCRIPT,
+ NOTIMESTAMPS + 1, 2471, 271, 610, 393, 360, 220, 31208, 377, 23150, 30, 50494,
]
# fmt: on

encoding = tokenizer(input_text, split_special_tokens=False).input_ids
decoding = tokenizer.decode(encoding)

self.assertEqual(EXPECTED_TOKENS, encoding)
- self.assertEqual(target_text, decoding)
+ self.assertEqual(input_text, decoding)

encoding = rust_tokenizer(input_text, split_special_tokens=False)
decoding = rust_tokenizer.decode(encoding)

self.assertEqual(EXPECTED_TOKENS, encoding)
- self.assertEqual(target_text, decoding)
+ self.assertEqual(input_text, decoding)


class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
Expand Down

0 comments on commit 0baebe7

Please sign in to comment.