From 58b8346ff113064b1ef478428dba96516049c035 Mon Sep 17 00:00:00 2001 From: aravindMahadevan Date: Tue, 11 Jun 2024 02:11:38 +0000 Subject: [PATCH 1/5] support user defined tokens by bounding timestamp token if statement --- src/tokenizers.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index 234eef15e..58e47d25b 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3610,6 +3610,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer { let chunk = new_chunk(); let time_offset = 0.0; const timestamp_begin = this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1; + const timestamp_end = this.model.convert_tokens_to_ids(["<|30.00|>"]); let previous_tokens = []; let previous_token_timestamps = []; @@ -3697,7 +3698,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer { } else { // 2/ This is a regular special token, ignoring it } - } else if (token >= timestamp_begin) { + } else if (token >= timestamp_begin && token <= timestamp_end) { // 3/ Timestamp token const time = (token - timestamp_begin) * time_precision + time_offset; const rounded_time = round(time, 2); From 7d0cdbf79f7c93864e6e2a651acb11110d0238d1 Mon Sep 17 00:00:00 2001 From: aravindMahadevan <15685389+aravindMahadevan@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:57:18 -0400 Subject: [PATCH 2/5] Update src/tokenizers.js Co-authored-by: Joshua Lochner --- src/tokenizers.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index 58e47d25b..599338ab4 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3610,7 +3610,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer { let chunk = new_chunk(); let time_offset = 0.0; const timestamp_begin = this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1; - const timestamp_end = this.model.convert_tokens_to_ids(["<|30.00|>"]); + const timestamp_end = 
this.model.convert_tokens_to_ids(["<|30.00|>"])[0]; let previous_tokens = []; let previous_token_timestamps = []; From e6e7e93c4e483d0709840766783f3828b1a77fd0 Mon Sep 17 00:00:00 2001 From: aravindMahadevan Date: Fri, 14 Jun 2024 02:33:38 +0000 Subject: [PATCH 3/5] calculate timestamp_end instead of hardcoding --- src/tokenizers.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index 599338ab4..19bad87cc 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3610,7 +3610,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer { let chunk = new_chunk(); let time_offset = 0.0; const timestamp_begin = this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1; - const timestamp_end = this.model.convert_tokens_to_ids(["<|30.00|>"])[0]; + // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments. + // We can calculate the last time stamp token as timestamp_begin plus the number of tokens + // tokens from 0.00 to 30.00 which is 1500. + const total_timestamp_tokens = (30.00 - 0.00) / 0.02 + const timestamp_end = timestamp_begin + total_timestamp_tokens let previous_tokens = []; let previous_token_timestamps = []; From 03b2c402d9d9ffc4c54443284b9b1cb9dd6db104 Mon Sep 17 00:00:00 2001 From: aravindMahadevan <15685389+aravindMahadevan@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:32:30 -0400 Subject: [PATCH 4/5] Update tokenizers.js --- src/tokenizers.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index 19bad87cc..ada58a8fe 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3613,8 +3613,8 @@ export class WhisperTokenizer extends PreTrainedTokenizer { // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments. // We can calculate the last time stamp token as timestamp_begin plus the number of tokens // tokens from 0.00 to 30.00 which is 1500. 
- const total_timestamp_tokens = (30.00 - 0.00) / 0.02 - const timestamp_end = timestamp_begin + total_timestamp_tokens + const total_timestamp_tokens = (30.00 - 0.00) / 0.02; + const timestamp_end = timestamp_begin + total_timestamp_tokens; let previous_tokens = []; let previous_token_timestamps = []; From 5a61999eb990dc685a62aa098ab4a85ed1b5a0ea Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sat, 7 Dec 2024 20:01:23 +0200 Subject: [PATCH 5/5] Merge conflict resolution --- src/tokenizers.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/tokenizers.js b/src/tokenizers.js index ada58a8fe..e99dde055 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3609,12 +3609,12 @@ export class WhisperTokenizer extends PreTrainedTokenizer { const chunks = []; let chunk = new_chunk(); let time_offset = 0.0; - const timestamp_begin = this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1; - // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments. - // We can calculate the last time stamp token as timestamp_begin plus the number of tokens - // tokens from 0.00 to 30.00 which is 1500. - const total_timestamp_tokens = (30.00 - 0.00) / 0.02; - const timestamp_end = timestamp_begin + total_timestamp_tokens; + const timestamp_begin = this.timestamp_begin; + // Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments. + // We can calculate the last timestamp token as timestamp_begin plus the number of + // 0.02 increments from 0.00 to 30.00, which is 1500. + const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02 + const timestamp_end = timestamp_begin + total_timestamp_tokens; let previous_tokens = []; let previous_token_timestamps = [];