[Whisper] Finalize batched SOTA long-form generation #27658
Changes from 83 commits
Changes to `src/transformers/generation/logits_process.py`:
```diff
@@ -95,6 +95,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
                 scores = processor(input_ids, scores, **kwargs)
             else:
                 scores = processor(input_ids, scores)
+
         return scores
```
```diff
@@ -1657,6 +1658,9 @@ def __init__(self, begin_suppress_tokens, begin_index):
         self.begin_suppress_tokens = list(begin_suppress_tokens)
         self.begin_index = begin_index

+    def set_begin_index(self, begin_index):
+        self.begin_index = begin_index
+
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if input_ids.shape[1] == self.begin_index:
```
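For context, a minimal, hypothetical usage sketch of the new `set_begin_index` hook: it lets batched long-form generation move the suppression point between segments without rebuilding the processor. The token ids and indices below are illustrative assumptions, not taken from this diff.

```python
import torch
from transformers.generation.logits_process import SuppressTokensAtBeginLogitsProcessor

# begin_suppress_tokens=[220, 50257] mirrors Whisper's usual defaults;
# begin_index=4 assumes three forced task tokens follow the decoder start token.
processor = SuppressTokensAtBeginLogitsProcessor(begin_suppress_tokens=[220, 50257], begin_index=4)

input_ids = torch.ones(1, 4, dtype=torch.long)   # current length == begin_index
scores = torch.randn(1, 51865)                   # (batch, vocab)
scores = processor(input_ids, scores)            # suppression fires at this position

# The next 30s segment has a longer prompt, so the first generated position moves;
# the processor is updated in place instead of being re-instantiated.
processor.set_begin_index(6)
```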
```diff
@@ -1778,6 +1782,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
         max_initial_timestamp_index (`int`, *optional*, defaults to 1):
             Used to set the maximum value of the initial timestamp. This is used to prevent the model from
             predicting timestamps that are too far in the future.
+        begin_index (`Optional`, *optional*): Token index of the first token that is generated by the model.
         _detect_timestamp_from_logprob (`bool`, *optional*): Whether timestamps can be predicted from logprobs over all timestamps.

     Examples:
```
```diff
@@ -1810,11 +1815,11 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
     """

     def __init__(
-        self, generate_config, _detect_timestamp_from_logprob: Optional[bool] = None
+        self, generate_config, begin_index: Optional[int] = None, _detect_timestamp_from_logprob: Optional[bool] = None
     ):  # support for the kwargs
-        self.eos_token_id = generate_config.eos_token_id
         self.no_timestamps_token_id = generate_config.no_timestamps_token_id
         self.timestamp_begin = generate_config.no_timestamps_token_id + 1
+        self.eos_token_id = generate_config.eos_token_id or generate_config.bos_token_id

         # this variable is mostly just used for testing
         self._detect_timestamp_from_logprob = (
```
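A quick sketch of the new constructor argument; the checkpoint name and index values are examples, and this assumes the checkpoint's generation config carries the Whisper-specific fields (`no_timestamps_token_id`, eos/bos ids, `forced_decoder_ids`).

```python
from transformers import GenerationConfig
from transformers.generation.logits_process import WhisperTimeStampLogitsProcessor

generate_config = GenerationConfig.from_pretrained("openai/whisper-tiny")

# Before this change, begin_index was always derived from forced_decoder_ids;
# it can now be passed explicitly and moved per segment for batched long-form decoding.
timestamp_processor = WhisperTimeStampLogitsProcessor(generate_config, begin_index=4)
timestamp_processor.set_begin_index(6)
```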
```diff
@@ -1823,10 +1828,17 @@ def __init__(
             else getattr(generate_config, "_detect_timestamp_from_logprob", True)
         )

-        self.begin_index = (
-            len(generate_config.forced_decoder_ids) + 1 if generate_config.forced_decoder_ids is not None else 1
-        )
+        num_forced_ids = (
+            len(generate_config.forced_decoder_ids) if generate_config.forced_decoder_ids is not None else 0
+        )
+        self.begin_index = begin_index or (num_forced_ids + 1)

         self.max_initial_timestamp_index = getattr(generate_config, "max_initial_timestamp_index", None)
+        # TODO(Patrick): Make sure that official models have max_initial_timestamp_index set to 50
+        # self.max_initial_timestamp_index = 50
+
+    def set_begin_index(self, begin_index):
+        self.begin_index = begin_index

     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
```

> **Review comment** (on the `max_initial_timestamp_index` TODO): Once the PR is in a mergeable state, we should update the model configurations of all Whisper models.
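A worked example of the new default (the forced ids shown are illustrative Whisper lang/task/no-timestamps tokens):

```python
# With three forced decoder ids, generation starts at position 4
# (position 0 is the decoder start token).
forced_decoder_ids = [(1, 50259), (2, 50359), (3, 50363)]

num_forced_ids = len(forced_decoder_ids) if forced_decoder_ids is not None else 0
begin_index = None                          # caller did not pass one
begin_index = begin_index or (num_forced_ids + 1)
assert begin_index == 4
```

Note that `begin_index or ...` falls back on any falsy value, so an explicit `0` would also be replaced; that is harmless here, since the first generated position is always at least 1.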
```diff
@@ -1878,6 +1890,63 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         return scores


+class WhisperNoSpeechDetection(LogitsProcessor):
+    r"""This processor can be used to detect silence when using Whisper."""
+
+    def __init__(self, no_speech_token: int, begin_index: int, scores_is_logprobs: bool = False):
+        self.no_speech_token = no_speech_token
+        # offset between <start-of-transcription> token, <SOT>, in paper and first generated token
+        # is equal to the position of the first generated token index
+        self.start_of_trans_offset = begin_index
+
+        # `self.begin_index` is a running value that is changed on the fly
+        self.begin_index = begin_index
+        self._no_speech_prob = [0.0]
+        self.is_scores_logprobs = scores_is_logprobs
+
+        # make sure we pass all logits
+        self._pass_all_logits = True
+
+        # overwritten dynamically
+        self.model = None
+        self.inputs = None
+
+    def set_model(self, model):
+        self.model = model
+
+    def set_inputs(self, inputs):
+        self.inputs = {**self.model.prepare_inputs_for_generation(**inputs), **inputs}
+        self.inputs["input_features"] = self.inputs.pop("inputs")
+
+    @property
+    def no_speech_prob(self):
+        return self._no_speech_prob
+
+    def set_begin_index(self, begin_index):
+        self.begin_index = begin_index
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        if input_ids.shape[1] == self.begin_index:
+            if self.start_of_trans_offset > 1:
+                with torch.no_grad():
+                    logits = self.model(**self.inputs).logits
+
+                no_speech_index = self.begin_index - self.start_of_trans_offset
+                no_speech_scores = logits[:, no_speech_index]
+            else:
+                no_speech_scores = scores
+
+            if self.is_scores_logprobs:
+                probs = no_speech_scores.exp()
+            else:
+                probs = no_speech_scores.float().softmax(dim=-1)
+
+            self._no_speech_prob = probs[:, self.no_speech_token]
+
+        return scores
+
+
 class ClassifierFreeGuidanceLogitsProcessor(LogitsProcessor):
     r"""
     [`LogitsProcessor`] for classifier free guidance (CFG). The scores are split over the batch dimension,
```

**Review discussion on `WhisperNoSpeechDetection`:**

> **Reviewer:** This processor class is quite awkward: it doesn't modify the scores […]. Looking at the generation code, this processor obtains the probability of voice activity, which may set […]. If what I wrote above is correct, the changes would have the following pros and cons: […]

> **Author (patrickvonplaten):** You are 100% correct that this logits processor is quite awkward! I'm also not super happy about the way it's implemented, but for me it was a hard requirement that Whisper doesn't change the `generate` method at all. The problem with retrieving the "no speech" prob from the […]. Other solutions that I have considered here:
>
> 1. […]
> 2. […]
>
> => Overall, I think neither 1) nor 2) is better than what we have now. I'm willing to go for 1) though if people prefer (also cc @sanchit-gandhi).

> **Reviewer:** 100% agreed 👍
>
> I see 😞 Our design makes this feature quite ugly to implement, but I think it's more important to match the reference paper/implementation!

> **Review comment** (on lines +1929 to +1930, `with torch.no_grad(): logits = self.model(**self.inputs).logits`): This should be used in the generate function, so no need to specify this, no?
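Because the actual wiring lives in Whisper's `generate()` rather than in this file, here is a hypothetical sketch of how a caller could drive the processor; the token id and the input handling are assumptions, not part of this diff.

```python
import torch
from transformers import WhisperForConditionalGeneration
from transformers.generation.logits_process import WhisperNoSpeechDetection

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# 50362 is <|nospeech|> in the multilingual vocab (illustrative; real code
# should read it from the model/generation config).
detector = WhisperNoSpeechDetection(no_speech_token=50362, begin_index=1)

# generate() injects the model and the current segment's inputs dynamically:
detector.set_model(model)
# detector.set_inputs(model_inputs)  # dict containing "inputs" (the mel features)

# At the first generated position, __call__ caches P(<|nospeech|>) per batch
# element without modifying the scores; the caller can then threshold it:
# is_silence = detector.no_speech_prob > no_speech_threshold
```

The `is_scores_logprobs` flag exists because `softmax` is only needed on raw logits; if the scores are already log-probabilities, `exp()` recovers the same probabilities without renormalizing.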
Changes to `src/transformers/generation/utils.py`:
```diff
@@ -518,6 +518,8 @@ def _prepare_decoder_input_ids_for_generation(
         # exception: Donut checkpoints have task-specific decoder starts and don't expect a BOS token
         elif self.config.model_type == "vision-encoder-decoder" and "donut" in self.name_or_path.lower():
             pass
+        elif self.config.model_type in ["whisper"]:
+            pass
         # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust
         # decoder_attention_mask if provided)
         elif (decoder_input_ids[:, 0] != decoder_start_token_id).all().item():
```

> **Author comment** (on the new `whisper` branch): For Whisper we often don't start with the `decoder_start_token_id`.
> **Review comment:** This was incorrect before.
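To illustrate why the new `whisper` exception is needed, a sketch with assumed multilingual-vocab token ids (not taken from the diff): when prompting, Whisper's decoder input legitimately starts with `<|startofprev|>`, so blindly prepending `decoder_start_token_id` would corrupt the sequence.

```python
import torch

decoder_start_token_id = 50258           # <|startoftranscript|> (assumed id)
start_of_prev_token_id = 50361           # <|startofprev|> (assumed id)

prompt_ids = torch.tensor([1770, 13, 2264])       # arbitrary prompt tokens
decoder_input_ids = torch.cat(
    [torch.tensor([start_of_prev_token_id]), prompt_ids, torch.tensor([decoder_start_token_id])]
).unsqueeze(0)

# The sequence does not begin with decoder_start_token_id, and the new branch
# ensures generate() leaves it untouched instead of prepending the start token.
assert decoder_input_ids[0, 0].item() != decoder_start_token_id
```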