From bfd9cdeefb6fb2fb9b5514f8a5fad6d7263a69d6 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 8 Aug 2024 14:56:13 +0200
Subject: [PATCH] Perf improvement 16% by removing offsets. (#1587)

* [Breaking Change] Perf improvement 16% by removing offsets.

Offsets are always calculated in Python land. By not calculating them,
we gain 16% of the runtime.

This is not the full extent of the possible gain, because offsets are
still calculated in bytes.

* Required features.

* Remove clippy error.

* Make it non-breaking and still show the perf improvement.

* Even faster without offsets.

* Update doc.

* Fmt.

* Apply suggestions from code review

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* fmt.

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
---
 bindings/python/benches/test_tiktoken.py   | 14 ++--
 .../python/py_src/tokenizers/__init__.pyi  | 36 +++++++++++
 bindings/python/src/tokenizer.rs           | 61 ++++++++++++++++++
 tokenizers/Cargo.toml                      | 10 +++
 tokenizers/benches/llama3.rs               | 42 ++++++++++++
 tokenizers/examples/encode_batch.rs        | 11 ++++
 tokenizers/src/tokenizer/mod.rs            | 64 +++++++++++++++++++
 tokenizers/src/tokenizer/pre_tokenizer.rs  | 15 +++++
 8 files changed, 247 insertions(+), 6 deletions(-)
 create mode 100644 tokenizers/benches/llama3.rs
 create mode 100644 tokenizers/examples/encode_batch.rs

diff --git a/bindings/python/benches/test_tiktoken.py b/bindings/python/benches/test_tiktoken.py
index 21a17c2d3..3fdad5daf 100644
--- a/bindings/python/benches/test_tiktoken.py
+++ b/bindings/python/benches/test_tiktoken.py
@@ -60,7 +60,12 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
         mergeable_ranks=mergeable_ranks,
         special_tokens=special_tokens,
     )
-    enc.encode("warmup")
+    out = enc.encode("This is a test")
+
+    hf_enc = Tokenizer.from_pretrained(model)
+    out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids
+
+    assert out == out2, "sanity check"
 
     start = time.perf_counter_ns()
     enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -69,11 +74,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-    hf_enc = Tokenizer.from_pretrained(model)
-    hf_enc.encode("warmup")
 
     start = time.perf_counter_ns()
-    hf_enc.encode_batch(documents)
+    hf_enc.encode_batch_fast(documents)
    end = time.perf_counter_ns()
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"huggingface \t{readable_size} / s")
@@ -82,8 +85,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
 
 def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
     dataset_xnli = load_dataset(dataset, dataset_config)
-    # input_lengths = [(10, False), (10_000, False), (10_000, True)]  # Example input lengths
-    input_lengths = [(10_000, False, True), (10_000, False, False)]
+    input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]
 
     for num_threads in threads:
         for length, fuse, long in input_lengths:
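For reference, both timing blocks above report throughput through `format_byte_size`, which is defined elsewhere in this benchmark file. A minimal sketch of what such a helper could look like (the exact unit breakpoints here are an assumption, not the file's actual implementation)::

    def format_byte_size(num_bytes: float) -> tuple[str, str]:
        """Return a human-readable size string plus the unit it was scaled to."""
        for unit in ("B", "KB", "MB", "GB"):
            if num_bytes < 1024:
                return f"{num_bytes:.2f} {unit}", unit
            num_bytes /= 1024
        return f"{num_bytes:.2f} TB", "TB"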
diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index 5dbc665dc..0ad96fc8a 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -892,6 +892,42 @@ class Tokenizer:
         """
         pass
 
+    def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
+        """
+        Encode the given batch of inputs. This method is faster than `encode_batch`
+        because it doesn't keep track of offsets, which will all be zero.
+
+        Example:
+            Here are some examples of the inputs that are accepted::
+
+                encode_batch_fast([
+                    "A single sequence",
+                    ("A tuple with a sequence", "And its pair"),
+                    [ "A", "pre", "tokenized", "sequence" ],
+                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+                ])
+
+        Args:
+            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+                A list of single sequences or pair sequences to encode. Each sequence
+                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+                argument:
+
+                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+
+            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+                Whether the input is already pre-tokenized
+
+            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+                Whether to add the special tokens
+
+        Returns:
+            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+
+        """
+        pass
+
     @property
     def encode_special_tokens(self):
         """
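The docstring above fully describes the new Python entry point. As a concrete usage sketch (the "gpt2" model id is only a placeholder; any tokenizer available on the Hub works)::

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("gpt2")  # placeholder model id

    encodings = tokenizer.encode_batch_fast(
        ["A single sequence", ("A tuple with a sequence", "And its pair")],
        add_special_tokens=False,
    )

    print(encodings[0].ids)  # same ids as encode_batch would produce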
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index c967f74ff..24a68c6bb 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -1055,6 +1055,67 @@ impl PyTokenizer {
         })
     }
 
+    /// Encode the given batch of inputs. This method is faster than `encode_batch`
+    /// because it doesn't keep track of offsets, which will all be zero.
+    ///
+    /// Example:
+    ///     Here are some examples of the inputs that are accepted::
+    ///
+    ///         encode_batch_fast([
+    ///             "A single sequence",
+    ///             ("A tuple with a sequence", "And its pair"),
+    ///             [ "A", "pre", "tokenized", "sequence" ],
+    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
+    ///         ])
+    ///
+    /// Args:
+    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
+    ///         A list of single sequences or pair sequences to encode. Each sequence
+    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
+    ///         argument:
+    ///
+    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
+    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
+    ///
+    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
+    ///         Whether the input is already pre-tokenized
+    ///
+    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
+    ///         Whether to add the special tokens
+    ///
+    /// Returns:
+    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
+    ///
+    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
+    #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
+    fn encode_batch_fast(
+        &self,
+        py: Python<'_>,
+        input: Vec<&PyAny>,
+        is_pretokenized: bool,
+        add_special_tokens: bool,
+    ) -> PyResult<Vec<PyEncoding>> {
+        let input: Vec<tk::EncodeInput> = input
+            .into_iter()
+            .map(|o| {
+                let input: tk::EncodeInput = if is_pretokenized {
+                    o.extract::<PreTokenizedEncodeInput>()?.into()
+                } else {
+                    o.extract::<TextEncodeInput>()?.into()
+                };
+                Ok(input)
+            })
+            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
+        py.allow_threads(|| {
+            ToPyResult(
+                self.tokenizer
+                    .encode_batch_fast(input, add_special_tokens)
+                    .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
+            )
+            .into()
+        })
+    }
+
     /// Decode the given list of ids back to a string
     ///
     /// This is used to decode anything coming back from a Language Model
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 6e7e8ad54..2184997ae 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -36,6 +36,11 @@ harness = false
 name = "unigram_benchmark"
 harness = false
 
+[[bench]]
+name = "llama3"
+required-features = ["http"]
+harness = false
+
 [dependencies]
 lazy_static = "1.4"
 rand = "0.8"
@@ -80,3 +85,8 @@ tracing-subscriber = "0.3.18"
 
 [profile.release]
 lto = "fat"
+
+[[example]]
+name = "encode_batch"
+required-features = ["http"]
+
diff --git a/tokenizers/benches/llama3.rs b/tokenizers/benches/llama3.rs
new file mode 100644
index 000000000..77af3bd63
--- /dev/null
+++ b/tokenizers/benches/llama3.rs
@@ -0,0 +1,42 @@
+#[macro_use]
+extern crate criterion;
+
+use criterion::{Criterion, Throughput};
+use tokenizers::Tokenizer;
+
+pub fn llama3(c: &mut Criterion) {
+    let data = std::fs::read_to_string("data/big.txt").unwrap();
+    let mut group = c.benchmark_group("llama3-encode");
+    group.throughput(Throughput::Bytes(data.len() as u64));
+    group.bench_function("llama3-offsets", |b| {
+        let tokenizer =
+            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
+        let data: Vec<_> = data.lines().collect();
+        let add_special_tokens = false;
+        b.iter(|| {
+            tokenizer
+                .encode_batch_char_offsets(criterion::black_box(data.clone()), add_special_tokens)
+                .unwrap()
+        })
+    });
+    group.bench_function("llama3-nooffsets", |b| {
+        let tokenizer =
+            Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
+        let data: Vec<_> = data.lines().collect();
+        let add_special_tokens = false;
+        b.iter(|| {
+            tokenizer
+                .encode_batch_fast(criterion::black_box(data.clone()), add_special_tokens)
+                .unwrap()
+        })
+    });
+    group.finish();
+}
+
+criterion_group! {
+    name = llama3_benches;
+    config = Criterion::default().sample_size(10);
+    targets = llama3
+}
+
+criterion_main!(llama3_benches);
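A rough Python counterpart to this Criterion benchmark, useful for quick local comparisons (this assumes the same data/big.txt corpus and uses "gpt2" as a stand-in model id)::

    import time

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("gpt2")  # stand-in model id
    with open("data/big.txt", encoding="utf-8") as f:
        lines = f.read().splitlines()

    for name, encode in [
        ("encode_batch", tokenizer.encode_batch),
        ("encode_batch_fast", tokenizer.encode_batch_fast),
    ]:
        start = time.perf_counter_ns()
        encode(lines, add_special_tokens=False)
        elapsed = (time.perf_counter_ns() - start) / 1e9
        print(f"{name}: {elapsed:.3f}s")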
diff --git a/tokenizers/examples/encode_batch.rs b/tokenizers/examples/encode_batch.rs
new file mode 100644
index 000000000..3167f3fd3
--- /dev/null
+++ b/tokenizers/examples/encode_batch.rs
@@ -0,0 +1,11 @@
+use tokenizers::Tokenizer;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;
+
+    let data = std::fs::read_to_string("data/big.txt")?;
+    let data: Vec<_> = data.lines().collect();
+    let add_special_tokens = false;
+    tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
+    Ok(())
+}
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 1c2ad6e0b..d92e04a1d 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -759,6 +759,48 @@ where
         }
     }
 
+    /// Encode the given input. This method accepts both single sequences, as well as pair
+    /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly.
+    /// Unlike `encode`, it does not compute offsets.
+    /// ```
+    /// # use tokenizers::Tokenizer;
+    /// # use tokenizers::models::bpe::BPE;
+    /// # let mut tokenizer = Tokenizer::new(BPE::default());
+    /// #
+    /// // Sequences:
+    /// tokenizer.encode_fast("Single sequence", false);
+    /// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
+    ///
+    /// // Pre-tokenized sequences:
+    /// tokenizer.encode_fast(&["Single", "sequence"][..], false);
+    /// tokenizer.encode_fast((
+    ///     &["Sequence", "A"][..],
+    ///     &["Sequence", "B"][..]
+    /// ), false);
+    ///
+    /// // or even both types together:
+    /// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
+    /// ```
+    pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
+    where
+        E: Into<EncodeInput<'s>>,
+    {
+        // Extract sequences from the EncodeInput
+        let (sequence, pair) = match input.into() {
+            EncodeInput::Single(s1) => (s1, None),
+            EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
+        };
+
+        // Encode each sequence
+        let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
+        let pair_encoding = pair
+            .map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
+            .transpose()?;
+
+        // And finally post process
+        self.post_process(encoding, pair_encoding, add_special_tokens)
+    }
+
     /// Encode the given input. This method accepts both single sequences, as well as pair
     /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
     ///
@@ -1059,6 +1101,28 @@ where
         Ok(encodings)
     }
 
+    /// Encode all the sentences in parallel, using multiple threads
+    pub fn encode_batch_fast<'s, E>(
+        &self,
+        inputs: Vec<E>,
+        add_special_tokens: bool,
+    ) -> Result<Vec<Encoding>>
+    where
+        E: Into<EncodeInput<'s>> + Send,
+    {
+        let mut encodings = inputs
+            .into_maybe_par_iter()
+            .map(|input| self.encode_fast(input, add_special_tokens))
+            .collect::<Result<Vec<Encoding>>>()?;
+
+        if let Some(params) = &self.padding {
+            // We do the padding here to make sure we handle the batch padding
+            pad_encodings(&mut encodings, params)?;
+        }
+
+        Ok(encodings)
+    }
+
     /// Decode all sentences in parallel
     pub fn decode_batch(
         &self,
diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs
index 9667c240a..0d54cd62b 100644
--- a/tokenizers/src/tokenizer/pre_tokenizer.rs
+++ b/tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -8,6 +8,7 @@ use std::collections::HashMap;
 pub enum OffsetType {
     Byte,
     Char,
+    None,
 }
 
 /// Wrapper for a subpart of a `NormalizedString`.
@@ -146,6 +147,19 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => {
+                let tokens = self
+                    .splits
+                    .into_iter()
+                    .flat_map(|split| {
+                        split.tokens.unwrap().into_iter().map(|token| {
+                            // Offsets are not tracked: emit the id with an empty value and zeroed offsets
+                            (token.id, String::with_capacity(0), (0, 0), None, 0)
+                        })
+                    })
+                    .collect();
+                return Ok(tokens);
+            }
         };
 
         Ok(self
@@ -197,6 +211,7 @@ impl PreTokenizedString {
         let offset_converter = match offset_type {
             OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
             OffsetType::Byte => None,
+            OffsetType::None => None,
         };
 
         let mut offset = 0;
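To see the `OffsetType::None` path end to end from Python: the ids match the regular path while the offsets come back zeroed. A small sketch (again with "gpt2" as a placeholder model id)::

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("gpt2")  # placeholder model id

    fast = tokenizer.encode_batch_fast(["hello world"])[0]
    slow = tokenizer.encode_batch(["hello world"])[0]

    assert fast.ids == slow.ids                    # identical token ids
    assert all(o == (0, 0) for o in fast.offsets)  # offsets are not tracked
    print(slow.offsets)                            # real offsets from encode_batch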