Make it non breaking and still show perf improvement.
Narsil committed Aug 8, 2024
1 parent 913c389 commit 8ee092a
Showing 3 changed files with 75 additions and 4 deletions.
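In short: the Python-facing `encode_batch` keeps its previous behavior (it now delegates to the Rust `encode_batch_char_offsets`, so character offsets are preserved), while the faster byte-offset path is exposed under the new name `encode_batch_fast`. A minimal usage sketch of the resulting API, assuming a tokenizer loaded with `Tokenizer.from_pretrained` ("gpt2" is just an example identifier):

    from tokenizers import Tokenizer

    tok = Tokenizer.from_pretrained("gpt2")
    docs = ["A single sequence", "Another document"]

    # Unchanged, non-breaking path: character offsets, as before this commit.
    slow = tok.encode_batch(docs)

    # New fast path: skips the byte-to-character offset conversion.
    fast = tok.encode_batch_fast(docs)

    # The token ids are identical; only the offset bookkeeping differs.
    assert [e.ids for e in slow] == [e.ids for e in fast]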
2 changes: 1 addition & 1 deletion bindings/python/benches/test_tiktoken.py
@@ -73,7 +73,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     hf_enc.encode("warmup")
 
     start = time.perf_counter_ns()
-    hf_enc.encode_batch(documents)
+    hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"huggingface \t{readable_size} / s")
38 changes: 38 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -892,6 +892,44 @@ class Tokenizer:
"""
pass

def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given batch of inputs. This method is faster than `encode_batch`
because it doesn't keep track of offsets.
Note: Currently the offsets are tracked in bytes instead of characters
But this may evolve in future releases to speed things even further.
Example:
Here are some examples of the inputs that are accepted::
encode_batch_fast([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])
Args:
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens
Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
"""
pass

@property
def encode_special_tokens(self):
"""
39 changes: 36 additions & 3 deletions bindings/python/src/tokenizer.rs
@@ -1048,16 +1048,49 @@ impl PyTokenizer {
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch(input, add_special_tokens)
+                    .encode_batch_char_offsets(input, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
         })
     }

+    /// Encode the given batch of inputs. This method is faster than `encode_batch`
+    /// because it doesn't keep track of character offsets.
+    /// Note: offsets are currently tracked in bytes rather than characters,
+    /// but this may evolve in future releases to speed things up even further.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode_batch_fast([
+    ///             "A single sequence",
+    ///             ("A tuple with a sequence", "And its pair"),
+    ///             [ "A", "pre", "tokenized", "sequence" ],
+    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+    ///         ])
+    ///
+    /// Args:
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+    ///
     #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
     #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
-    fn encode_batch_char_offsets(
+    fn encode_batch_fast(
         &self,
         py: Python<'_>,
         input: Vec<&PyAny>,
@@ -1078,7 +1111,7 @@ impl PyTokenizer {
         py.allow_threads(|| {
             ToPyResult(
                 self.tokenizer
-                    .encode_batch_char_offsets(input, add_special_tokens)
+                    .encode_batch(input, add_special_tokens)
                     .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
             )
             .into()
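The net effect of the swap above: the binding formerly exposed as `encode_batch_char_offsets` is now named `encode_batch_fast` and calls the byte-offset `encode_batch` in the core crate, while the Python-facing `encode_batch` calls `encode_batch_char_offsets` and therefore keeps returning character offsets. If the byte-offset note in the docstring holds, the two methods should only diverge on non-ASCII input. A hedged check (assumes `tok` is a loaded `tokenizers.Tokenizer`):

    # "é" is one character but two UTF-8 bytes, so byte and character
    # offsets diverge after it.
    text = "café au lait"

    char_enc = tok.encode_batch([text])[0]       # character offsets (unchanged behavior)
    byte_enc = tok.encode_batch_fast([text])[0]  # byte offsets (faster, per the docstring)

    print(char_enc.offsets)
    print(byte_enc.offsets)
    # Offsets for tokens after the multi-byte "é" are expected to differ by one.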
