Perf improvement 16% by removing offsets. (#1587)
* [Breaking Change] Perf improvement 16% by removing offsets.

Offsets are always calculated in Python land. By not calculating them at all,
we win 16% of the runtime.

This is not the full extent of the possible gain, because offsets are still
calculated in bytes (see the usage sketch after the commit message).

* Required features.

* Remove clippy error.

* Make it non-breaking and still show the perf improvement.

* Even faster without offsets.

* Update doc.

* Fmt.

* Apply suggestions from code review

Co-authored-by: Arthur <[email protected]>

* fmt.

---------

Co-authored-by: Arthur <[email protected]>
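
For illustration, here is a minimal usage sketch of the new fast path from the Python bindings (not part of the commit). The "gpt2" model id and the document count are placeholders; the only API assumed beyond the existing `encode_batch` is the `encode_batch_fast` method added in this change:

```python
from time import perf_counter_ns

from tokenizers import Tokenizer

# Placeholder model id; any tokenizer loadable via from_pretrained works.
tokenizer = Tokenizer.from_pretrained("gpt2")

documents = ["This is a test"] * 10_000

# Existing path: offsets are computed for every token.
start = perf_counter_ns()
with_offsets = tokenizer.encode_batch(documents)
print(f"encode_batch:      {(perf_counter_ns() - start) / 1e6:.1f} ms")

# New fast path: offsets are skipped and come back as (0, 0).
start = perf_counter_ns()
without_offsets = tokenizer.encode_batch_fast(documents)
print(f"encode_batch_fast: {(perf_counter_ns() - start) / 1e6:.1f} ms")

# Token ids are identical on both paths; only the offsets differ.
assert with_offsets[0].ids == without_offsets[0].ids
assert all(offset == (0, 0) for offset in without_offsets[0].offsets)
```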
Narsil and ArthurZucker authored Aug 8, 2024
1 parent bd27fa5 commit bfd9cde
Showing 8 changed files with 247 additions and 6 deletions.
14 changes: 8 additions & 6 deletions bindings/python/benches/test_tiktoken.py
@@ -60,7 +60,12 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
mergeable_ranks=mergeable_ranks,
special_tokens=special_tokens,
)
enc.encode("warmup")
out = enc.encode("This is a test")

hf_enc = Tokenizer.from_pretrained(model)
out2 = hf_enc.encode("This is a test", add_special_tokens=False).ids

assert out == out2, "sanity check"

start = time.perf_counter_ns()
enc.encode_ordinary_batch(documents, num_threads=num_threads)
@@ -69,11 +74,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
print(f"tiktoken \t{readable_size} / s")

hf_enc = Tokenizer.from_pretrained(model)
hf_enc.encode("warmup")

start = time.perf_counter_ns()
hf_enc.encode_batch(documents)
hf_enc.encode_batch_fast(documents)
end = time.perf_counter_ns()
readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
print(f"huggingface \t{readable_size} / s")
@@ -82,8 +85,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
dataset_xnli = load_dataset(dataset, dataset_config)

# input_lengths = [(10, False), (10_000, False), (10_000, True)] # Example input lengths
input_lengths = [(10_000, False, True), (10_000, False, False)]
input_lengths = [(10, False, True), (10_000, False, True), (10_000, False, False)]

for num_threads in threads:
for length, fuse, long in input_lengths:
36 changes: 36 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -892,6 +892,42 @@ class Tokenizer:
"""
pass

def encode_batch_fast(self, input, is_pretokenized=False, add_special_tokens=True):
"""
Encode the given batch of inputs. This method is faster than `encode_batch`
because it doesn't keep track of offsets; they will all be zeros.
Example:
Here are some examples of the inputs that are accepted::
encode_batch_fast([
"A single sequence",
("A tuple with a sequence", "And its pair"),
[ "A", "pre", "tokenized", "sequence" ],
([ "A", "pre", "tokenized", "sequence" ], "And its pair")
])
Args:
input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
A list of single sequences or pair sequences to encode. Each sequence
can be either raw text or pre-tokenized, according to the ``is_pretokenized``
argument:
- If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
- If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
Whether the input is already pre-tokenized
add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add the special tokens
Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
"""
pass

@property
def encode_special_tokens(self):
"""
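As a reference, a hedged, runnable sketch of the input forms listed in the `encode_batch_fast` docstring above (the model id is a placeholder; pre-tokenized input goes through the same method with `is_pretokenized=True`):

```python
from tokenizers import Tokenizer

# Placeholder model id; substitute any tokenizer you have access to.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

# Raw text: single sequences and (sequence, pair) tuples in one batch.
encodings = tokenizer.encode_batch_fast(
    [
        "A single sequence",
        ("A tuple with a sequence", "And its pair"),
    ],
    add_special_tokens=True,
)

# Pre-tokenized input uses the same method, with is_pretokenized=True.
pre_tokenized = tokenizer.encode_batch_fast(
    [["A", "pre", "tokenized", "sequence"]],
    is_pretokenized=True,
)

# Offsets are not tracked on this path, so every offset is (0, 0).
print(encodings[0].ids)
print(encodings[0].offsets)
```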
61 changes: 61 additions & 0 deletions bindings/python/src/tokenizer.rs
@@ -1055,6 +1055,67 @@ impl PyTokenizer {
})
}

/// Encode the given batch of inputs. This method is faster than `encode_batch`
/// because it doesn't keep track of offsets; they will all be zeros.
///
/// Example:
/// Here are some examples of the inputs that are accepted::
///
/// encode_batch_fast([
/// "A single sequence",
/// ("A tuple with a sequence", "And its pair"),
/// [ "A", "pre", "tokenized", "sequence" ],
/// ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
/// ])
///
/// Args:
/// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
/// A list of single sequences or pair sequences to encode. Each sequence
/// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
/// argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// Returns:
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
#[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
fn encode_batch_fast(
&self,
py: Python<'_>,
input: Vec<&PyAny>,
is_pretokenized: bool,
add_special_tokens: bool,
) -> PyResult<Vec<PyEncoding>> {
let input: Vec<tk::EncodeInput> = input
.into_iter()
.map(|o| {
let input: tk::EncodeInput = if is_pretokenized {
o.extract::<PreTokenizedEncodeInput>()?.into()
} else {
o.extract::<TextEncodeInput>()?.into()
};
Ok(input)
})
.collect::<PyResult<Vec<tk::EncodeInput>>>()?;
py.allow_threads(|| {
ToPyResult(
self.tokenizer
.encode_batch_fast(input, add_special_tokens)
.map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
)
.into()
})
}

/// Decode the given list of ids back to a string
///
/// This is used to decode anything coming back from a Language Model
10 changes: 10 additions & 0 deletions tokenizers/Cargo.toml
@@ -36,6 +36,11 @@ harness = false
name = "unigram_benchmark"
harness = false

[[bench]]
name = "llama3"
required-features = ["http"]
harness = false

[dependencies]
lazy_static = "1.4"
rand = "0.8"
@@ -80,3 +85,8 @@ tracing-subscriber = "0.3.18"

[profile.release]
lto = "fat"

[[example]]
name = "encode_batch"
required-features = ["http"]
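
A hedged usage note: because of `required-features = ["http"]`, the new benchmark and the new example presumably have to be run with that feature enabled, e.g. `cargo bench --bench llama3 --features http` and `cargo run --example encode_batch --features http`; both also read a local `data/big.txt` corpus, as shown in the sources below.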

42 changes: 42 additions & 0 deletions tokenizers/benches/llama3.rs
@@ -0,0 +1,42 @@
#[macro_use]
extern crate criterion;

use criterion::{Criterion, Throughput};
use tokenizers::Tokenizer;

pub fn llama3(c: &mut Criterion) {
let data = std::fs::read_to_string("data/big.txt").unwrap();
let mut group = c.benchmark_group("llama3-encode");
group.throughput(Throughput::Bytes(data.bytes().len() as u64));
group.bench_function("llama3-offsets", |b| {
let tokenizer =
Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
b.iter(|| {
tokenizer
.encode_batch_char_offsets(criterion::black_box(data.clone()), add_special_tokens)
.unwrap()
})
});
group.bench_function("llama3-nooffsets", |b| {
let tokenizer =
Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
b.iter(|| {
tokenizer
.encode_batch(criterion::black_box(data.clone()), add_special_tokens)
.unwrap()
})
});
group.finish();
}

criterion_group! {
name = bert_benches;
config = Criterion::default().sample_size(10);
targets = llama3
}

criterion_main!(bert_benches);
11 changes: 11 additions & 0 deletions tokenizers/examples/encode_batch.rs
@@ -0,0 +1,11 @@
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None)?;

let data = std::fs::read_to_string("data/big.txt")?;
let data: Vec<_> = data.lines().collect();
let add_special_tokens = false;
tokenizer.encode_batch_char_offsets(data, add_special_tokens)?;
Ok(())
}
64 changes: 64 additions & 0 deletions tokenizers/src/tokenizer/mod.rs
@@ -759,6 +759,48 @@ where
}
}

/// Encode the given input. This method accepts both single sequences, as well as pair
/// sequences. Also, a sequence can be a string, or already pre-tokenized input directly.
/// Unlike `encode`, it does not compute offsets.
/// ```
/// # use tokenizers::Tokenizer;
/// # use tokenizers::models::bpe::BPE;
/// # let mut tokenizer = Tokenizer::new(BPE::default());
/// #
/// // Sequences:
/// tokenizer.encode_fast("Single sequence", false);
/// tokenizer.encode_fast(("Sequence A", "Sequence B"), false);
///
/// // Pre-tokenized sequences:
/// tokenizer.encode_fast(&["Single", "sequence"][..], false);
/// tokenizer.encode_fast((
/// &["Sequence", "A"][..],
/// &["Sequence", "B"][..]
/// ), false);
///
/// // or even both types together:
/// tokenizer.encode_fast(("A complete sequence", &["And", "a", "tokenized"][..]), false);
/// ```
pub fn encode_fast<'s, E>(&self, input: E, add_special_tokens: bool) -> Result<Encoding>
where
E: Into<EncodeInput<'s>>,
{
// Extract sequences from the EncodeInput
let (sequence, pair) = match input.into() {
EncodeInput::Single(s1) => (s1, None),
EncodeInput::Dual(s1, s2) => (s1, Some(s2)),
};

// Encode each sequence
let encoding = self.encode_single_sequence(sequence, 0, OffsetType::None)?;
let pair_encoding = pair
.map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::None))
.transpose()?;

// And finally post process
self.post_process(encoding, pair_encoding, add_special_tokens)
}

/// Encode the given input. This method accepts both single sequences, as well as pair
/// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
///
@@ -1059,6 +1101,28 @@
Ok(encodings)
}

/// Encode all the sentences in parallel, using multiple threads
pub fn encode_batch_fast<'s, E>(
&self,
inputs: Vec<E>,
add_special_tokens: bool,
) -> Result<Vec<Encoding>>
where
E: Into<EncodeInput<'s>> + Send,
{
let mut encodings = inputs
.into_maybe_par_iter()
.map(|input| self.encode_fast(input, add_special_tokens))
.collect::<Result<Vec<Encoding>>>()?;

if let Some(params) = &self.padding {
// We do the padding here to make sure we handle the batch padding
pad_encodings(&mut encodings, params)?;
}

Ok(encodings)
}

/// Decode all sentences in parallel
pub fn decode_batch(
&self,
15 changes: 15 additions & 0 deletions tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -8,6 +8,7 @@ use std::collections::HashMap;
pub enum OffsetType {
Byte,
Char,
None,
}

/// Wrapper for a subpart of a `NormalizedString`.
@@ -146,6 +147,19 @@ impl PreTokenizedString {
let offset_converter = match offset_type {
OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
OffsetType::Byte => None,
OffsetType::None => {
let tokens = self
.splits
.into_iter()
.flat_map(|split| {
split.tokens.unwrap().into_iter().map(|token| {
// Replace this with the actual fields you need for the Encoding type
(token.id, String::with_capacity(0), (0, 0), None, 0)
})
})
.collect();
return Ok(tokens);
}
};

Ok(self
@@ -197,6 +211,7 @@ impl PreTokenizedString {
let offset_converter = match offset_type {
OffsetType::Char => Some(BytesToCharOffsetConverter::new(&self.original)),
OffsetType::Byte => None,
OffsetType::None => None,
};

let mut offset = 0;
