huggingface · Narsil · Sep 27, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
@@ -641,8 +641,11 @@ def generate_token(
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
-                    output_text = self.decode(
-                        all_input_ids[-stopping_criteria.current_tokens :, 0]
+                    output_text, _, _ = self.decode_token(
+                        all_input_ids[:, 0],
+                        prefix_offset=len(all_input_ids) - stopping_criteria.current_tokens - 1,
+                        read_offset=len(all_input_ids) - stopping_criteria.current_tokens,
+                        skip_special_tokens=True
                     )
                     # Get seed
                     if isinstance(next_token_chooser.choice, Sampling):

diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
@@ -793,11 +793,6 @@ def warmup(self, batch: FlashCausalLMBatch):
 
         return int(num_blocks * BLOCK_SIZE)
 
-    def decode(self, generated_ids: Union[torch.Tensor, List[int]]) -> str:
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1008,8 +1003,11 @@ def generate_token(
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
-                    output_text = self.decode(
-                        all_input_ids[-stopping_criteria.current_tokens :]
+                    output_text, _, _ = self.decode_token(
+                        all_input_ids,
+                        prefix_offset=len(all_input_ids) - stopping_criteria.current_tokens - 1,
+                        read_offset=len(all_input_ids) - stopping_criteria.current_tokens,
+                        skip_special_tokens=True
                     )
                     generated_text = GeneratedText(
                         output_text,

diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
@@ -611,11 +611,6 @@ def __init__(
     def batch_type(self) -> Type[IdeficsCausalLMBatch]:
         return IdeficsCausalLMBatch
 
-    def decode(self, generated_ids: List[int]) -> str:
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
     def forward(
         self,
         input_ids,
@@ -728,8 +723,11 @@ def generate_token(
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
-                    output_text = self.decode(
-                        all_input_ids[-stopping_criteria.current_tokens :, 0]
+                    output_text, _, _ = self.decode_token(
+                        all_input_ids[:, 0],
+                        prefix_offset=len(all_input_ids) - stopping_criteria.current_tokens - 1,
+                        read_offset=len(all_input_ids) - stopping_criteria.current_tokens,
+                        skip_special_tokens=True
                     )
                     # Get seed
                     if isinstance(next_token_chooser.choice, Sampling):

diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
@@ -64,16 +64,17 @@ def decode_token(
         all_input_ids: List[int],
         prefix_offset: int = 0,
         read_offset: int = 0,
+        skip_special_tokens: bool = False,
     ) -> Tuple[str, int, int]:
         """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
 
         # The prefix text is necessary only to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
         prefix_text = self.tokenizer.decode(
-            all_input_ids[prefix_offset:read_offset], skip_special_tokens=False
+            all_input_ids[prefix_offset:read_offset], skip_special_tokens=skip_special_tokens
         )
         new_text = self.tokenizer.decode(
-            all_input_ids[prefix_offset:], skip_special_tokens=False
+            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
         )
 
         if len(new_text) > len(prefix_text) and not new_text.endswith("�"):

diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
@@ -710,8 +710,11 @@ def generate_token(
                 if stop:
                     # Slice with decoder_input_length to remove padding
                     # Decode all tokens
-                    output_text = self.decode(
-                        all_decoder_input_ids[-decoder_input_length:]
+                    output_text, _, _ = self.decode_token(
+                        all_decoder_input_ids,
+                        prefix_offset=len(all_decoder_input_ids) - decoder_input_length - 1,
+                        read_offset=len(all_decoder_input_ids) - decoder_input_length,
+                        skip_special_tokens=True
                     )
 
                     # Get seed