Fix encode_batch and encode_batch_fast to accept ndarrays again (#1679)

* Fix encode_batch and encode_batch_fast to accept ndarrays again
* Fix clippy

---------

Co-authored-by: Dimitris Iliopoulos <[email protected]>
huggingface · Nov 26, 2024 · 4bc7b4c · 4bc7b4c
1 parent a1c572e
commit 4bc7b4c
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 16 deletions.
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
@@ -408,10 +408,10 @@ impl<'s> FromPyObject<'s> for TextEncodeInput<'s> {
         if let Ok((i1, i2)) = ob.extract::<(TextInputSequence, TextInputSequence)>() {
             return Ok(Self((i1, i2).into()));
         }
-        if let Ok(arr) = ob.downcast::<PyList>() {
+        if let Ok(arr) = ob.extract::<Vec<Bound<PyAny>>>() {
             if arr.len() == 2 {
-                let first = arr.get_item(0)?.extract::<TextInputSequence>()?;
-                let second = arr.get_item(1)?.extract::<TextInputSequence>()?;
+                let first = arr[0].extract::<TextInputSequence>()?;
+                let second = arr[1].extract::<TextInputSequence>()?;
                 return Ok(Self((first, second).into()));
             }
         }
@@ -435,10 +435,10 @@ impl<'s> FromPyObject<'s> for PreTokenizedEncodeInput<'s> {
         {
             return Ok(Self((i1, i2).into()));
         }
-        if let Ok(arr) = ob.downcast::<PyList>() {
+        if let Ok(arr) = ob.extract::<Vec<Bound<PyAny>>>() {
             if arr.len() == 2 {
-                let first = arr.get_item(0)?.extract::<PreTokenizedInputSequence>()?;
-                let second = arr.get_item(1)?.extract::<PreTokenizedInputSequence>()?;
+                let first = arr[0].extract::<PreTokenizedInputSequence>()?;
+                let second = arr[1].extract::<PreTokenizedInputSequence>()?;
                 return Ok(Self((first, second).into()));
             }
         }
@@ -1033,13 +1033,12 @@ impl PyTokenizer {
     fn encode_batch(
         &self,
         py: Python<'_>,
-        input: Bound<'_, PySequence>,
+        input: Vec<Bound<'_, PyAny>>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
-        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len()?);
-        for i in 0..input.len()? {
-            let item = input.get_item(i)?;
+        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len());
+        for item in &input {
             let item: tk::EncodeInput = if is_pretokenized {
                 item.extract::<PreTokenizedEncodeInput>()?.into()
             } else {
@@ -1093,13 +1092,12 @@ impl PyTokenizer {
     fn encode_batch_fast(
         &self,
         py: Python<'_>,
-        input: Bound<'_, PySequence>,
+        input: Vec<Bound<'_, PyAny>>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
-        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len()?);
-        for i in 0..input.len()? {
-            let item = input.get_item(i)?;
+        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len());
+        for item in &input {
             let item: tk::EncodeInput = if is_pretokenized {
                 item.extract::<PreTokenizedEncodeInput>()?.into()
             } else {

diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
@@ -153,8 +153,6 @@ def test_encode(self):
         assert len(output) == 2
 
     def test_encode_formats(self, bert_files):
-        print("Broken by the change from std::usize::Max to usixeMax")
-        return 0
         with pytest.deprecated_call():
             tokenizer = BertWordPieceTokenizer(bert_files["vocab"])