From 5e223ceb487722476403885c1ce7c1351bb525b9 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Tue, 5 Nov 2024 16:24:23 +0100
Subject: [PATCH] fix pylist (#1673)

* fix pylist

* add comment about why we use PySequence

* style

* fix encode batch fast as well

* Update bindings/python/src/tokenizer.rs

Co-authored-by: Nicolas Patry

* fix with capacity

* stub :)

---------

Co-authored-by: Nicolas Patry
---
 .../python/py_src/tokenizers/__init__.pyi |  4 +-
 bindings/python/src/tokenizer.rs          | 54 +++++++++----------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 6c731ff0a..11e6e556c 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -859,7 +859,9 @@ class Tokenizer:
     def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
         """
         Encode the given batch of inputs. This method accept both raw text sequences
-        as well as already pre-tokenized sequences.
+        as well as already pre-tokenized sequences. The reason we use `PySequence` is
+        because it allows type checking with zero-cost (according to PyO3) as we don't
+        have to convert to check.
 
         Example:
             Here are some examples of the inputs that are accepted::
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 7fd03ae89..aa7019f2d 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -995,7 +995,9 @@ impl PyTokenizer {
     }
 
     /// Encode the given batch of inputs. This method accept both raw text sequences
-    /// as well as already pre-tokenized sequences.
+    /// as well as already pre-tokenized sequences. The reason we use `PySequence` is
+    /// because it allows type checking with zero-cost (according to PyO3) as we don't
+    /// have to convert to check.
     ///
     /// Example:
     ///     Here are some examples of the inputs that are accepted::
@@ -1030,25 +1032,24 @@ impl PyTokenizer {
     fn encode_batch(
         &self,
         py: Python<'_>,
-        input: Bound<'_, PyList>,
+        input: Bound<'_, PySequence>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
-        let input: Vec<tk::EncodeInput> = input
-            .into_iter()
-            .map(|o| {
-                let input: tk::EncodeInput = if is_pretokenized {
-                    o.extract::<PreTokenizedEncodeInput>()?.into()
-                } else {
-                    o.extract::<TextEncodeInput>()?.into()
-                };
-                Ok(input)
-            })
-            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len()?);
+        for i in 0..input.len()? {
+            let item = input.get_item(i)?;
+            let item: tk::EncodeInput = if is_pretokenized {
+                item.extract::<PreTokenizedEncodeInput>()?.into()
+            } else {
+                item.extract::<TextEncodeInput>()?.into()
+            };
+            items.push(item);
+        }
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch_char_offsets(input, add_special_tokens)
+                    .encode_batch_char_offsets(items, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
@@ -1091,25 +1092,24 @@ impl PyTokenizer {
     fn encode_batch_fast(
         &self,
         py: Python<'_>,
-        input: Bound<'_, PyList>,
+        input: Bound<'_, PySequence>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
-        let input: Vec<tk::EncodeInput> = input
-            .into_iter()
-            .map(|o| {
-                let input: tk::EncodeInput = if is_pretokenized {
-                    o.extract::<PreTokenizedEncodeInput>()?.into()
-                } else {
-                    o.extract::<TextEncodeInput>()?.into()
-                };
-                Ok(input)
-            })
-            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len()?);
+        for i in 0..input.len()? {
+            let item = input.get_item(i)?;
+            let item: tk::EncodeInput = if is_pretokenized {
+                item.extract::<PreTokenizedEncodeInput>()?.into()
+            } else {
+                item.extract::<TextEncodeInput>()?.into()
+            };
+            items.push(item);
+        }
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch_fast(input, add_special_tokens)
+                    .encode_batch_fast(items, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
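
Not part of the patch itself: a minimal usage sketch of what the PyList -> PySequence
change means on the Python side. After this change `encode_batch` (and
`encode_batch_fast`) accept any sequence of inputs, e.g. a tuple, not only a list.
The `tokenizer.json` path below is a placeholder::

    from tokenizers import Tokenizer

    # Placeholder file; any serialized tokenizer works here.
    tokenizer = Tokenizer.from_file("tokenizer.json")

    # Raw text batch passed as a tuple (any sequence is accepted, not just a list).
    encodings = tokenizer.encode_batch(("Hello world", "How are you?"))

    # Pre-tokenized batch, also as any sequence.
    encodings = tokenizer.encode_batch(
        [["Hello", "world"], ["How", "are", "you", "?"]],
        is_pretokenized=True,
    )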