Make it non breaking and still show perf improvement.
Narsil committed Aug 8, 2024
1 parent 913c389 commit 8ee092a
Showing 3 changed files with 75 additions and 4 deletions.
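In short: the Python-facing `encode_batch` keeps its previous behavior (it now delegates to the Rust `encode_batch_char_offsets`, so character offsets are preserved), while the faster byte-offset path is exposed under the new name `encode_batch_fast`. A minimal usage sketch of the resulting API, assuming a tokenizer loaded with `Tokenizer.from_pretrained` ("gpt2" is just an example identifier):

    from tokenizers import Tokenizer

    tok = Tokenizer.from_pretrained("gpt2")
    docs = ["A single sequence", "Another document"]

    # Unchanged, non-breaking path: character offsets, as before this commit.
    slow = tok.encode_batch(docs)

    # New fast path: skips the byte-to-character offset conversion.
    fast = tok.encode_batch_fast(docs)

    # The token ids are identical; only the offset bookkeeping differs.
    assert [e.ids for e in slow] == [e.ids for e in fast]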
2 changes: 1 addition & 1 deletion bindings/python/benches/test_tiktoken.py
@@ -73,7 +73,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     hf_enc.encode("warmup")
 
     start = time.perf_counter_ns()
-    hf_enc.encode_batch(documents)
+    hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"huggingface \t{readable_size} / s")
38 changes: 38 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -892,6 +892,44 @@ class Tokenizer:
"""
pass

def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given batch of inputs. This method is faster than `encode_batch`
because it doesn't keep track of offsets.
Note: Currently the offsets are tracked in bytes instead of characters
But this may evolve in future releases to speed things even further.
Example:
Here are some examples of the inputs that are accepted::
encode_batch_fast([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])
Args:
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens
Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
"""
pass

@property
def encode_special_tokens(self):
"""
39 changes: 36 additions & 3 deletions bindings/python/src/tokenizer.rs
@@ -1048,16 +1048,49 @@ impl PyTokenizer {
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch(input, add_special_tokens)
+                    .encode_batch_char_offsets(input, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
         })
     }

+    /// Encode the given batch of inputs. This method is faster than `encode_batch`
+    /// because it doesn't keep track of character offsets.
+    /// Note: offsets are currently tracked in bytes rather than characters,
+    /// but this may evolve in future releases to speed things up even further.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode_batch_fast([
+    ///             "A single sequence",
+    ///             ("A tuple with a sequence", "And its pair"),
+    ///             [ "A", "pre", "tokenized", "sequence" ],
+    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+    ///         ])
+    ///
+    /// Args:
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+    ///
     #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
     #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
-    fn encode_batch_char_offsets(
+    fn encode_batch_fast(
         &self,
         py: Python<'_>,
         input: Vec<&PyAny>,
@@ -1078,7 +1111,7 @@ impl PyTokenizer {
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch_char_offsets(input, add_special_tokens)
+                    .encode_batch(input, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
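The net effect of the swap above: the binding formerly exposed as `encode_batch_char_offsets` is now named `encode_batch_fast` and calls the byte-offset `encode_batch` in the core crate, while the Python-facing `encode_batch` calls `encode_batch_char_offsets` and therefore keeps returning character offsets. If the byte-offset note in the docstring holds, the two methods should only diverge on non-ASCII input. A hedged check (assumes `tok` is a loaded `tokenizers.Tokenizer`):

    # "é" is one character but two UTF-8 bytes, so byte and character
    # offsets diverge after it.
    text = "café au lait"

    char_enc = tok.encode_batch([text])[0]       # character offsets (unchanged behavior)
    byte_enc = tok.encode_batch_fast([text])[0]  # byte offsets (faster, per the docstring)

    print(char_enc.offsets)
    print(byte_enc.offsets)
    # Offsets for tokens after the multi-byte "é" are expected to differ by one.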
