From 391cd450ded9038c06099906a667f4d9a6e6ed3e Mon Sep 17 00:00:00 2001
From: Robin Bakker
Date: Mon, 24 Jun 2024 14:27:54 +0200
Subject: [PATCH 1/5] add language to words

_collate_word_timestamps uses the return_language flag to determine
whether the language of the chunk should be added to the word's information
---
 .../models/whisper/tokenization_whisper.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 303822de65f8b0..c5015f3b6e934a 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -1013,7 +1013,7 @@ def new_chunk():
                     chunk["text"] = resolved_text
                     if return_timestamps == "word":
                         chunk["words"] = _collate_word_timestamps(
-                            tokenizer, resolved_tokens, resolved_token_timestamps, last_language
+                            tokenizer, resolved_tokens, resolved_token_timestamps, last_language, return_language
                         )
                     chunks.append(chunk)

@@ -1065,7 +1065,7 @@ def new_chunk():
         chunk["text"] = resolved_text
         if return_timestamps == "word":
             chunk["words"] = _collate_word_timestamps(
-                tokenizer, resolved_tokens, resolved_token_timestamps, last_language
+                tokenizer, resolved_tokens, resolved_token_timestamps, last_language, return_language
             )
         chunks.append(chunk)

@@ -1197,12 +1197,16 @@ def _find_longest_common_sequence(sequences, token_timestamp_sequences=None):
     return total_sequence, []


-def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language):
+def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language, return_language):
     words, _, token_indices = _combine_tokens_into_words(tokenizer, tokens, language)
+
+    optional_language_field = {"language": language} if return_language else {}
+
     timings = [
         {
             "text": word,
             "timestamp": (token_timestamps[indices[0]][0], token_timestamps[indices[-1]][1]),
+            **optional_language_field
         }
         for word, indices in zip(words, token_indices)
     ]

From 549c0f1fb534c73c1d0bbd53080d7fc20c5a625f Mon Sep 17 00:00:00 2001
From: Robin Bakker
Date: Mon, 24 Jun 2024 15:40:28 +0200
Subject: [PATCH 2/5] ran style checks

added missing comma
---
 src/transformers/models/whisper/tokenization_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index c5015f3b6e934a..da3d83ac61387f 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -1206,7 +1206,7 @@ def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language, retu
         {
             "text": word,
             "timestamp": (token_timestamps[indices[0]][0], token_timestamps[indices[-1]][1]),
-            **optional_language_field
+            **optional_language_field,
         }
         for word, indices in zip(words, token_indices)
     ]

From 0352e8ea117eecd08962443a25209f4410664e55 Mon Sep 17 00:00:00 2001
From: Robin Bakker
Date: Tue, 2 Jul 2024 23:26:49 +0200
Subject: [PATCH 3/5] add new language test

test that the pipeline can return both the language and timestamp
---
 ..._pipelines_automatic_speech_recognition.py | 58 ++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 73376ff2189c09..994bc8cb1d797c 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -324,7 +324,6 @@ def test_torch_large_with_input_features(self):

     @slow
     @require_torch
-    @slow
     def test_return_timestamps_in_preprocess(self):
         pipe = pipeline(
             task="automatic-speech-recognition",
@@ -368,6 +367,63 @@ def test_return_timestamps_in_preprocess(self):
         )
         # fmt: on

+    @slow
+    @require_torch
+    def test_return_timestamps_and_language_in_preprocess(self):
+        pipe = pipeline(
+            task="automatic-speech-recognition",
+            model="openai/whisper-tiny",
+            chunk_length_s=8,
+            stride_length_s=1,
+            return_language=True,
+        )
+        data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
+        sample = next(iter(data))
+        pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="en", task="transcribe")
+
+        res = pipe(sample["audio"]["array"])
+        self.assertEqual(
+            res,
+            {
+                "text": " Conquered returned to its place amidst the tents.",
+                "chunks": [{"language": "english", "text": " Conquered returned to its place amidst the tents."}],
+            },
+        )
+        res = pipe(sample["audio"]["array"], return_timestamps=True)
+        self.assertEqual(
+            res,
+            {
+                "text": " Conquered returned to its place amidst the tents.",
+                "chunks": [
+                    {
+                        "timestamp": (0.0, 3.36),
+                        "language": "english",
+                        "text": " Conquered returned to its place amidst the tents.",
+                    }
+                ],
+            },
+        )
+        pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
+        res = pipe(sample["audio"]["array"], return_timestamps="word")
+        # fmt: off
+        self.assertEqual(
+            res,
+            {
+                'text': ' Conquered returned to its place amidst the tents.',
+                'chunks': [
+                    {"language": "english", 'text': ' Conquered', 'timestamp': (0.5, 1.2)},
+                    {"language": "english", 'text': ' returned', 'timestamp': (1.2, 1.64)},
+                    {"language": "english", 'text': ' to', 'timestamp': (1.64, 1.84)},
+                    {"language": "english", 'text': ' its', 'timestamp': (1.84, 2.02)},
+                    {"language": "english", 'text': ' place', 'timestamp': (2.02, 2.28)},
+                    {"language": "english", 'text': ' amidst', 'timestamp': (2.28, 2.8)},
+                    {"language": "english", 'text': ' the', 'timestamp': (2.8, 2.98)},
+                    {"language": "english", 'text': ' tents.', 'timestamp': (2.98, 3.48)},
+                ],
+            },
+        )
+        # fmt: on
+
     @slow
     @require_torch
     def test_return_timestamps_in_preprocess_longform(self):

From e09de69312f8b918a894a489467f74119680ee1e Mon Sep 17 00:00:00 2001
From: Robin Bakker
Date: Tue, 16 Jul 2024 20:07:16 +0200
Subject: [PATCH 4/5] remove model configuration in test

Removed model configurations that do not influence test results
---
 .../pipelines/test_pipelines_automatic_speech_recognition.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 994bc8cb1d797c..ca0d70005aafef 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -333,10 +333,10 @@ def test_return_timestamps_in_preprocess(self):
         )
         data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
         sample = next(iter(data))
-        pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="en", task="transcribe")

         res = pipe(sample["audio"]["array"])
         self.assertEqual(res, {"text": " Conquered returned to its place amidst the tents."})
+
         res = pipe(sample["audio"]["array"], return_timestamps=True)
         self.assertEqual(
             res,
@@ -345,9 +345,8 @@ def test_return_timestamps_in_preprocess(self):
                 "chunks": [{"timestamp": (0.0, 3.36), "text": " Conquered returned to its place amidst the tents."}],
             },
         )
-        pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
-        res = pipe(sample["audio"]["array"], return_timestamps="word")

+        res = pipe(sample["audio"]["array"], return_timestamps="word")
         # fmt: off
         self.assertEqual(
             res,

From 270dc94b9927a5e2830b6d2456425afadb8f57cc Mon Sep 17 00:00:00 2001
From: Robin Bakker
Date: Tue, 16 Jul 2024 20:11:39 +0200
Subject: [PATCH 5/5] remove model configuration in test

Removed model configurations that do not influence test results
---
 .../pipelines/test_pipelines_automatic_speech_recognition.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index ca0d70005aafef..581e071c53d99f 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -378,7 +378,6 @@ def test_return_timestamps_and_language_in_preprocess(self):
         )
         data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True)
         sample = next(iter(data))
-        pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language="en", task="transcribe")

         res = pipe(sample["audio"]["array"])
         self.assertEqual(
@@ -388,6 +387,7 @@ def test_return_timestamps_and_language_in_preprocess(self):
                 "chunks": [{"language": "english", "text": " Conquered returned to its place amidst the tents."}],
             },
         )
+
         res = pipe(sample["audio"]["array"], return_timestamps=True)
         self.assertEqual(
             res,
@@ -402,7 +402,7 @@ def test_return_timestamps_and_language_in_preprocess(self):
                 ],
             },
         )
-        pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
+
         res = pipe(sample["audio"]["array"], return_timestamps="word")
         # fmt: off
         self.assertEqual(
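
Usage sketch (not part of the patch series itself): the snippet below illustrates what these patches enable, mirroring the test added in PATCH 3/5. When return_language=True is combined with return_timestamps="word", each word entry carries the chunk's language alongside its text and timestamps. The checkpoint name and audio path are illustrative placeholders; the exact transcription, timestamps, and detected language depend on the model and the audio.

from transformers import pipeline

# Build a Whisper ASR pipeline that also reports the detected language,
# following the configuration used in test_return_timestamps_and_language_in_preprocess.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
    chunk_length_s=8,
    stride_length_s=1,
    return_language=True,
)

# "sample.wav" is a placeholder; any audio file path or float waveform array works.
out = asr("sample.wav", return_timestamps="word")

# With these patches applied, every entry in out["chunks"] includes a language
# field in addition to the word text and (start, end) timestamp, e.g.
# {"language": "english", "text": " Conquered", "timestamp": (0.5, 1.2)}.
for word in out["chunks"]:
    print(word["language"], word["text"], word["timestamp"])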