From 5e223ceb487722476403885c1ce7c1351bb525b9 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Tue, 5 Nov 2024 16:24:23 +0100
Subject: [PATCH] fix pylist (#1673)

* fix pylist

* add comment about why we use PySequence

* style

* fix encode batch fast as well

* Update bindings/python/src/tokenizer.rs

Co-authored-by: Nicolas Patry

* fix with capacity

* stub :)

---------

Co-authored-by: Nicolas Patry
---
 .../python/py_src/tokenizers/__init__.pyi |  4 +-
 bindings/python/src/tokenizer.rs          | 54 +++++++++----------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 6c731ff0a..11e6e556c 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -859,7 +859,9 @@ class Tokenizer:
     def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
         """
         Encode the given batch of inputs. This method accept both raw text sequences
-        as well as already pre-tokenized sequences.
+        as well as already pre-tokenized sequences. The reason we use `PySequence` is
+        because it allows type checking with zero-cost (according to PyO3) as we don't
+        have to convert to check.
 
         Example:
             Here are some examples of the inputs that are accepted::
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 7fd03ae89..aa7019f2d 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -995,7 +995,9 @@ impl PyTokenizer {
     }
 
     /// Encode the given batch of inputs. This method accept both raw text sequences
-    /// as well as already pre-tokenized sequences.
+    /// as well as already pre-tokenized sequences. The reason we use `PySequence` is
+    /// because it allows type checking with zero-cost (according to PyO3) as we don't
+    /// have to convert to check.
     ///
     /// Example:
     ///     Here are some examples of the inputs that are accepted::
@@ -1030,25 +1032,24 @@ impl PyTokenizer {
     fn encode_batch(
         &self,
         py: Python<'_>,
-        input: Bound<'_, PyList>,
+        input: Bound<'_, PySequence>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
-        let input: Vec<tk::EncodeInput> = input
-            .into_iter()
-            .map(|o| {
-                let input: tk::EncodeInput = if is_pretokenized {
-                    o.extract::<PreTokenizedEncodeInput>()?.into()
-                } else {
-                    o.extract::<TextEncodeInput>()?.into()
-                };
-                Ok(input)
-            })
-            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len()?);
+        for i in 0..input.len()? {
+            let item = input.get_item(i)?;
+            let item: tk::EncodeInput = if is_pretokenized {
+                item.extract::<PreTokenizedEncodeInput>()?.into()
+            } else {
+                item.extract::<TextEncodeInput>()?.into()
+            };
+            items.push(item);
+        }
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch_char_offsets(input, add_special_tokens)
+                    .encode_batch_char_offsets(items, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
@@ -1091,25 +1092,24 @@ impl PyTokenizer {
     fn encode_batch_fast(
         &self,
         py: Python<'_>,
-        input: Bound<'_, PyList>,
+        input: Bound<'_, PySequence>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
-        let input: Vec<tk::EncodeInput> = input
-            .into_iter()
-            .map(|o| {
-                let input: tk::EncodeInput = if is_pretokenized {
-                    o.extract::<PreTokenizedEncodeInput>()?.into()
-                } else {
-                    o.extract::<TextEncodeInput>()?.into()
-                };
-                Ok(input)
-            })
-            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        let mut items = Vec::<tk::EncodeInput>::with_capacity(input.len()?);
+        for i in 0..input.len()? {
+            let item = input.get_item(i)?;
+            let item: tk::EncodeInput = if is_pretokenized {
+                item.extract::<PreTokenizedEncodeInput>()?.into()
+            } else {
+                item.extract::<TextEncodeInput>()?.into()
+            };
+            items.push(item);
+        }
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch_fast(input, add_special_tokens)
+                    .encode_batch_fast(items, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
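
Not part of the patch itself: a minimal usage sketch of what the PyList -> PySequence
change means on the Python side. After this change `encode_batch` (and
`encode_batch_fast`) accept any sequence of inputs, e.g. a tuple, not only a list.
The `tokenizer.json` path below is a placeholder::

    from tokenizers import Tokenizer

    # Placeholder file; any serialized tokenizer works here.
    tokenizer = Tokenizer.from_file("tokenizer.json")

    # Raw text batch passed as a tuple (any sequence is accepted, not just a list).
    encodings = tokenizer.encode_batch(("Hello world", "How are you?"))

    # Pre-tokenized batch, also as any sequence.
    encodings = tokenizer.encode_batch(
        [["Hello", "world"], ["How", "are", "you", "?"]],
        is_pretokenized=True,
    )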