From 8bd9f33e02045fa35588fdd0659058d19b9ef393 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 20 Jun 2024 16:01:51 +0200 Subject: [PATCH 01/17] initial commit --- tokenizers/src/tokenizer/added_vocabulary.rs | 20 ++++++++++++++++++++ tokenizers/src/tokenizer/pre_tokenizer.rs | 19 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index a0c2f4542..45eb9b725 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -425,6 +425,26 @@ impl AddedVocabulary { splits } + fn fast_split_with_indices( + &self, + sentence: NormalizedString, + split_re: &MatchingSet, + ) -> Vec<(NormalizedString, Option>)> { + self.find_matches(sentence.get(), split_re) + .into_iter() + .map(|(id, byte_offsets)| { + let slice = sentence + .slice(Range::Normalized(byte_offsets.0..byte_offsets.1)) + .expect("AddedVocabulary bad split"); + if let Some(id) = id { + (slice, Some(vec![Token::new(id, String::new(), (0, 0))])) + } else { + (slice, None) + } + }) + .collect() + } + /// Split the input sentence to extract anything we found from the `MatchingSet`, as well as /// the list of corresponding IDs /// The list of IDs have the exact same number of elements than the Iterator. diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs index 54e24f76a..c645d0da9 100644 --- a/tokenizers/src/tokenizer/pre_tokenizer.rs +++ b/tokenizers/src/tokenizer/pre_tokenizer.rs @@ -186,6 +186,25 @@ impl PreTokenizedString { } } + pub fn fast_into_encoding(self) -> Result { + if self.splits.is_empty() { + Ok(Encoding::default()) + } else if !self.splits.iter().all(|split| split.tokens.is_some()) { + Err("Split has not been tokenized.".into()) + } else { + let tokens = self + .splits + .into_iter() + .flat_map(|split| { + split.tokens.unwrap().into_iter().map(|token| { + // Replace this with the actual fields you need for the Encoding type + (token.id, String::new(), (0, 0), None, 0) + }) + }) + .collect(); + Ok(tokens) + } + } /// Returns a list of splits, each of them being a slice of the normalized /// string, the associated offsets either in original or normalized /// referential, as well as the potention tokens From 9b5f433f58f836b71efbf1ddd82aa0e887a79ddc Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 20 Jun 2024 16:19:42 +0200 Subject: [PATCH 02/17] sounds fun --- tokenizers/src/tokenizer/added_vocabulary.rs | 43 +++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index 45eb9b725..6c46c456c 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -92,6 +92,25 @@ impl std::hash::Hash for AddedToken { } } +use std::num::NonZeroU64; +use std::thread; + +pub struct FakeThreadId(NonZeroU64); + +fn hash_current_thread() -> usize { + // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter + // that works great for our use case of avoiding collisions in our array. Unfortunately, + // it's private. 
However, there are only so many ways you can layout a u64, so just transmute
+    // https://github.com/rust-lang/rust/issues/67939
+    const _: [u8; 8] = [0; std::mem::size_of::<thread::ThreadId>()];
+    const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];
+    let x =
+        unsafe { std::mem::transmute::<thread::ThreadId, FakeThreadId>(thread::current().id()).0 };
+    u64::from(x) as usize
+}
+
+const MAX_NUM_THREADS: usize = 128;
+
 type MatchingSet = (AhoCorasick, Vec<u32>);
 
 lazy_static! {
@@ -156,11 +175,16 @@ pub struct AddedVocabulary {
     /// us remove them easily with an O(1) complexity.
     special_tokens_set: HashSet<String>,
 
-    /// A RegexSet containing all the non-normalized patterns used to split on AddedTokens
+    //// A RegexSet containing all the non-normalized patterns used to split on AddedTokens
     split_trie: MatchingSet,
     /// A RegexSet containing all the normalized patterns used to split on AddedTokens
     split_normalized_trie: MatchingSet,
 
+    // A RegexSet containing all the non-normalized patterns used to split on AddedTokens
+    split_trie_vec: Vec<MatchingSet>,
+    /// A RegexSet containing all the normalized patterns used to split on AddedTokens
+    split_normalized_trie_vec: Vec<MatchingSet>,
+
     /// Whether or not special tokens should be splitted when encoding. This is equivalent to ignoring them
     encode_special_tokens: bool,
 }
@@ -181,8 +205,10 @@ impl AddedVocabulary {
             added_tokens: vec![],
             special_tokens: vec![],
             special_tokens_set: HashSet::new(),
-            split_trie: (trie, vec![]),
-            split_normalized_trie: (normalized_trie, vec![]),
+            split_trie: (trie.clone(), vec![]),
+            split_normalized_trie: (normalized_trie.clone(), vec![]),
+            split_trie_vec: vec![(trie, vec![])],
+            split_normalized_trie_vec: vec![(normalized_trie, vec![])],
             encode_special_tokens: false,
         }
     }
@@ -437,7 +463,7 @@ impl AddedVocabulary {
                     .slice(Range::Normalized(byte_offsets.0..byte_offsets.1))
                     .expect("AddedVocabulary bad split");
                 if let Some(id) = id {
-                    (slice, Some(vec![Token::new(id, String::new(), (0, 0))]))
+                    (slice, Some(vec![Token::new(id, String::new(), (0,))]))
                 } else {
                     (slice, None)
                 }
             })
@@ -485,7 +511,12 @@ impl AddedVocabulary {
         // 1. We extract all the non-normalized tokens from the non-normalized string
         pretokenized
-            .split(|_, sequence| Ok(self.split_with_indices(sequence, &self.split_trie)))
+            .split(|_, sequence| {
+                Ok(self.split_with_indices(
+                    sequence,
+                    &self.split_trie_vec[hash_current_thread() % MAX_NUM_THREADS],
+                ))
+            })
             .expect("AddedVocabulary bad split");
 
         // normalized = False

From 079040ea245772ecc4835df09c5b1a055aec51ec Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 20 Jun 2024 16:23:04 +0200
Subject: [PATCH 03/17] what I hope for

---
 tokenizers/src/tokenizer/added_vocabulary.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 6c46c456c..5e5a42e32 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -371,6 +371,7 @@ impl AddedVocabulary {
             .build(tokens.iter().map(|token| &token.content))
             .expect("Failed to build tried when refreshing tokens");
         self.split_trie = (trie, ids);
+        self.split_trie_vec = vec![self.split_trie.clone(); MAX_NUM_THREADS];
 
         let (ntokens, nids): (Vec<&AddedToken>, Vec<u32>) = normalized.into_iter().unzip();
         let patterns: Vec<_> = ntokens
@@ -388,6 +389,7 @@ impl AddedVocabulary {
             .build(patterns.iter().map(|content| content.get()))
             .expect("Failed to build tried when refreshing tokens (normalized)");
         self.split_normalized_trie = (normalized_trie, nids);
+        self.split_normalized_trie_vec = vec![self.split_normalized_trie.clone(); MAX_NUM_THREADS];
     }
 
     /// Find any AddedToken in the given sentence, using the provided MatchingSet.
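The idea patches 02 and 03 converge on is simple: keep one clone of the read-only matching state per possible thread slot, and let each worker index its own slot by hashing its thread id, so parallel encoding workers never contend on shared state. A portable sketch of that pattern on stable Rust, using `Hash` on `ThreadId` instead of the transmute trick (`thread_slot` and the `String` stand-in for the trie are illustrative, not part of the PR):

    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};
    use std::thread;

    const MAX_NUM_THREADS: usize = 128;

    // Map the current thread onto a fixed-size array of per-thread state.
    // Collisions are possible but harmless here: the state is read-only.
    fn thread_slot() -> usize {
        let mut hasher = DefaultHasher::new();
        thread::current().id().hash(&mut hasher);
        (hasher.finish() as usize) % MAX_NUM_THREADS
    }

    fn main() {
        // One clone of the read-only matcher per slot, built once up front;
        // this stands in for `split_trie_vec` in the patch.
        let tries: Vec<String> = vec!["aho-corasick trie".to_string(); MAX_NUM_THREADS];

        thread::scope(|s| {
            for _ in 0..8 {
                s.spawn(|| {
                    // Each worker reads "its" copy: no locking, no contention.
                    let local = &tries[thread_slot()];
                    assert_eq!(local.as_str(), "aho-corasick trie");
                });
            }
        });
    }

Two threads may hash to the same slot, which stays harmless because the state is only ever read; the PR's `hash_current_thread` gets a similar effect from Rust's sequential internal thread-id counter.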
From fddbbf839b12e8f748f7ef9d0d45428f289cdca1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 20 Jun 2024 16:25:32 +0200
Subject: [PATCH 04/17] nit

---
 tokenizers/src/tokenizer/added_vocabulary.rs | 7 +++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 5e5a42e32..59e12a07a 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -465,7 +465,7 @@ impl AddedVocabulary {
                     .slice(Range::Normalized(byte_offsets.0..byte_offsets.1))
                     .expect("AddedVocabulary bad split");
                 if let Some(id) = id {
-                    (slice, Some(vec![Token::new(id, String::new(), (0,))]))
+                    (slice, Some(vec![Token::new(id, String::new(), (0, 0))]))
                 } else {
                     (slice, None)
                 }
             })
@@ -537,7 +537,10 @@ impl AddedVocabulary {
         pretokenized
             .split(|_, mut sequence| {
                 normalizer.map(|n| n.normalize(&mut sequence));
-                Ok(self.split_with_indices(sequence, &self.split_normalized_trie_vec))
+                Ok(self.split_with_indices(
+                    sequence,
+                    &self.split_normalized_trie_vec[hash_current_thread() % MAX_NUM_THREADS],
+                ))
             })
             .expect("AddedVocabulary bad split");

From 0e61735487b473de81e8c96b112d27ed2cdf4fec Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 20 Jun 2024 16:30:14 +0200
Subject: [PATCH 05/17] oiups

---
 tokenizers/src/tokenizer/added_vocabulary.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 59e12a07a..c409a6b57 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -106,7 +106,7 @@ fn hash_current_thread() -> usize {
     const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];
     let x =
         unsafe { std::mem::transmute::<thread::ThreadId, FakeThreadId>(thread::current().id()).0 };
-    u64::from(x) as usize
+    u64::from(x) as usize - 1
 }
 
 const MAX_NUM_THREADS: usize = 128;

From 085b06884a88f94b67c9a957d5fcd21e41618cdf Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 20 Jun 2024 16:32:14 +0200
Subject: [PATCH 06/17] nit

---
 tokenizers/src/tokenizer/added_vocabulary.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index c409a6b57..1d663cfd0 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -207,8 +207,8 @@ impl AddedVocabulary {
             special_tokens_set: HashSet::new(),
             split_trie: (trie.clone(), vec![]),
             split_normalized_trie: (normalized_trie.clone(), vec![]),
-            split_trie_vec: vec![(trie, vec![])],
-            split_normalized_trie_vec: vec![(normalized_trie, vec![])],
+            split_trie_vec: vec![(trie, vec![]); MAX_NUM_THREADS],
+            split_normalized_trie_vec: vec![(normalized_trie, vec![]); MAX_NUM_THREADS],
             encode_special_tokens: false,
         }
     }

From 459fe629a781d49efd353bcc836b0135bec88f06 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 20 Jun 2024 16:47:58 +0200
Subject: [PATCH 07/17] Is this what's expected?

---
 tokenizers/src/pre_tokenizers/byte_level.rs  | 29 +++++++++++++++++++-
 tokenizers/src/tokenizer/added_vocabulary.rs |  2 +-
 tokenizers/src/tokenizer/mod.rs              |  2 +-
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 2d3845b55..13d3fbef4 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -41,6 +41,14 @@ lazy_static! {
         r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
     )
     .unwrap();
+    static ref RE_VEC: Vec<SysRegex> = {
+        let pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
+        let mut vec = Vec::with_capacity(MAX_NUM_THREADS);
+        for _ in 0..MAX_NUM_THREADS {
+            vec.push(SysRegex::new(pattern).unwrap());
+        }
+        vec
+    };
     static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
     static ref CHAR_BYTES: HashMap<char, u8> =
         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
@@ -111,12 +119,31 @@ impl ByteLevel {
     }
 }
 
+use std::num::NonZeroU64;
+use std::thread;
+
+pub struct FakeThreadId(NonZeroU64);
+
+fn hash_current_thread() -> usize {
+    // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
+    // that works great for our use case of avoiding collisions in our array. Unfortunately,
+    // it's private. However, there are only so many ways you can layout a u64, so just transmute
+    // https://github.com/rust-lang/rust/issues/67939
+    const _: [u8; 8] = [0; std::mem::size_of::<thread::ThreadId>()];
+    const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];
+    let x =
+        unsafe { std::mem::transmute::<thread::ThreadId, FakeThreadId>(thread::current().id()).0 };
+    u64::from(x) as usize - 1
+}
+
+const MAX_NUM_THREADS: usize = 128;
+
 /// As a `PreTokenizer`, `ByteLevel` is in charge of transforming all the unicode characters into
 /// their byte-level counterpart. It also splits the input according to the configured regex.
 // TODO: Give the ability to modify this regex
 impl PreTokenizer for ByteLevel {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        let re_ref: &SysRegex = &RE;
+        let re_ref: &SysRegex = &RE_VEC[hash_current_thread() % MAX_NUM_THREADS]; // TODO use the thread thing here as well!
         pretokenized.split(|_, mut normalized| {
             if self.add_prefix_space && !normalized.get().starts_with(' ') {
                 normalized.prepend(" ");
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 1d663cfd0..7ed2ef8b0 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -514,7 +514,7 @@ impl AddedVocabulary {
         // 1.
We extract all the non-normalized tokens from the non-normalized string pretokenized .split(|_, sequence| { - Ok(self.split_with_indices( + Ok(self.fast_split_with_indices( sequence, &self.split_trie_vec[hash_current_thread() % MAX_NUM_THREADS], )) diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 99e2b7127..98cf9adbe 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -882,7 +882,7 @@ where ) -> Result { let mut pretokenized: PreTokenizedString = pretokenized.into(); pretokenized.tokenize(|normalized| self.model.tokenize(normalized.get()))?; - pretokenized.into_encoding(word_idx, type_id, offsets_type) + pretokenized.fast_into_encoding() } } From 5b621030a352c97f271938cc739e39166c18cf63 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 20 Jun 2024 18:52:52 +0200 Subject: [PATCH 08/17] just testing some sutff --- tokenizers/benches/bpe_benchmark.rs | 4 ++-- tokenizers/src/pre_tokenizers/byte_level.rs | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tokenizers/benches/bpe_benchmark.rs b/tokenizers/benches/bpe_benchmark.rs index dd65d233e..3589d449a 100644 --- a/tokenizers/benches/bpe_benchmark.rs +++ b/tokenizers/benches/bpe_benchmark.rs @@ -24,8 +24,8 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer { let mut tokenizer = Tokenizer::new(bpe); tokenizer.with_pre_tokenizer(ByteLevel::default()); tokenizer.with_decoder(ByteLevel::default()); - tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]); - tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]); + // tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]); + // tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]); tokenizer } diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs index 13d3fbef4..e458d7864 100644 --- a/tokenizers/src/pre_tokenizers/byte_level.rs +++ b/tokenizers/src/pre_tokenizers/byte_level.rs @@ -169,7 +169,8 @@ impl PreTokenizer for ByteLevel { .map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))), ); } - normalized.transform(transformations, 0); + // normalized.transform(transformations, 0); // TODO here what whould happen if we ignore + // aligments? Ok(()) }) } @@ -199,6 +200,7 @@ impl Decoder for ByteLevel { } } +// TODO this is also somewhere we want to just skip if we are fast /// As a `PostProcessor`, `ByteLevel` is in charge of trimming the offsets if necessary. 
impl PostProcessor for ByteLevel { fn added_tokens(&self, _is_pair: bool) -> usize { From 6499eb2044dbb5d4ea7fdc9ee002294161eb106e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 12 Jul 2024 16:14:09 +0200 Subject: [PATCH 09/17] add test --- tokenizers/Cargo.toml | 4 ++++ tokenizers/benches/bert_decode_benchmark.rs | 25 +++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tokenizers/benches/bert_decode_benchmark.rs diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index 038486880..e29bd8d41 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -36,6 +36,10 @@ harness = false name = "unigram_benchmark" harness = false +[[bench]] +name = "bert_decode_benchmark" +harness = false + [dependencies] lazy_static = "1.4" rand = "0.8" diff --git a/tokenizers/benches/bert_decode_benchmark.rs b/tokenizers/benches/bert_decode_benchmark.rs new file mode 100644 index 000000000..a209feac4 --- /dev/null +++ b/tokenizers/benches/bert_decode_benchmark.rs @@ -0,0 +1,25 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use tokenizers::tokenizer::Tokenizer; + +fn decode(tokenizer: &Tokenizer, ids_slice: Vec, skip_special_tokens: bool) -> String { + tokenizer + .decode(&ids_slice, skip_special_tokens) + .expect("failed to decode input") +} + +fn criterion_benchmark(c: &mut Criterion) { + let tokenizer = + Tokenizer::from_file("data/bert-wiki.json").expect("failed to create tokenizer"); + c.bench_function("decode", |b| { + b.iter(|| { + decode( + &tokenizer, + black_box([2829, 4419, 14523, 2058, 1996, 13971, 3899].to_vec()), + black_box(true), + ) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From d50ee79370244284046d0b3a363dbaf9a8549c50 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Sat, 13 Jul 2024 12:28:29 +0200 Subject: [PATCH 10/17] important bench? 
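The benchmark added below times encoding with criterion's `iter_custom`, which hands the routine an iteration count and trusts it to return the measured `Duration` itself, so per-iteration setup (such as cloning the input batch) stays outside the measurement. A minimal, self-contained sketch of that pattern with a placeholder workload:

    use criterion::{black_box, criterion_group, criterion_main, Criterion};
    use std::time::{Duration, Instant};

    fn criterion_benchmark(c: &mut Criterion) {
        c.bench_function("manually timed section", |b| {
            b.iter_custom(|iters| {
                let mut total = Duration::ZERO;
                for i in 0..iters {
                    // Untimed per-iteration setup would go here.
                    let start = Instant::now();
                    // Only this region is counted toward the reported time.
                    black_box((0..1_000u64).fold(i, |acc, x| acc.wrapping_add(x)));
                    total += start.elapsed();
                }
                total
            })
        });
    }

    criterion_group!(benches, criterion_benchmark);
    criterion_main!(benches);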
--- tokenizers/Cargo.toml | 4 ++ .../benches/important_tokenizer_benchmark.rs | 59 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 tokenizers/benches/important_tokenizer_benchmark.rs diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index e29bd8d41..cd3e5c89f 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -40,6 +40,10 @@ harness = false name = "bert_decode_benchmark" harness = false +[[bench]] +name = "important_tokenizer_benchmark" +harness = false + [dependencies] lazy_static = "1.4" rand = "0.8" diff --git a/tokenizers/benches/important_tokenizer_benchmark.rs b/tokenizers/benches/important_tokenizer_benchmark.rs new file mode 100644 index 000000000..89b4ea46f --- /dev/null +++ b/tokenizers/benches/important_tokenizer_benchmark.rs @@ -0,0 +1,59 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use tokenizers::tokenizer::Tokenizer; +extern crate criterion; + +mod common; + +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::Path; + +use tokenizers::tokenizer::EncodeInput; + +use common::{iter_bench_encode, iter_bench_encode_batch}; +use std::ops::Deref; +use std::time::{Duration, Instant}; + +static BATCH_SIZE: usize = usize::MAX; + +fn bench_inference(c: &mut Criterion) { + let tokenizer = Tokenizer::from_pretrained("mistralai/Mistral-7B-v0.1", None).unwrap(); + let mut lines: Vec = vec![]; + let mut batches: Vec> = vec![vec![]]; + for line in BufReader::new(File::open(Path::new("data/big.txt")).unwrap()).lines() { + let line: EncodeInput = line.unwrap().into(); + lines.push(line.clone()); + if batches.last().unwrap().len() >= BATCH_SIZE { + batches.push(vec![]); + } + batches.last_mut().unwrap().push(line); + } + + c.bench_function("mistral encode long input", |b| { + b.iter_custom(|iters| iter_bench_encode(iters, tokenizer.deref(), &lines)) + }); + + c.bench_function("encode single batch of very long input", |b| { + b.iter_custom(|iters| iter_bench_encode_batch(iters, tokenizer.deref(), &batches)) + }); + + c.bench_function("decode long input", |b| { + b.iter_custom(|iters| { + let mut duration = Duration::new(0, 0); + let mut line_index: usize = 0; + for _i in 0..iters { + if line_index >= lines.len() { + line_index = 0; + } + let input = batches[0].clone(); + let start = Instant::now(); + let _ = black_box(tokenizer.encode_batch(input, false)); + duration = duration.checked_add(start.elapsed()).unwrap(); + } + duration + }) + }); +} + +criterion_group!(benches, bench_inference); +criterion_main!(benches); From c2b365564e28f7e8fa83061a883f0ff8eb888805 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Sat, 13 Jul 2024 19:39:31 +0200 Subject: [PATCH 11/17] update --- bindings/python/Cargo.lock | 1327 +++++++++++++++++++++++++ bindings/python/Cargo.toml | 4 + bindings/python/benches/bench_gpt2.py | 76 ++ 3 files changed, 1407 insertions(+) create mode 100644 bindings/python/Cargo.lock create mode 100644 bindings/python/benches/bench_gpt2.py diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock new file mode 100644 index 000000000..f4f8f1632 --- /dev/null +++ b/bindings/python/Cargo.lock @@ -0,0 +1,1327 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" + +[[package]] +name = "anstyle-parse" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "libc", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfaff671f6b22ca62406885ece523383b9b64022e341e53e009a62ebc47a45f2" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a216b506622bb1d316cd51328dce24e07bdff4a6128a47c7e7fad11878d5adbb" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = 
"clap_lex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "console" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + "windows-sys 0.45.0", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca89a0e215bab21874660c67903c5f143333cab1da83d041c7ded6053774751" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2fe95351b870527a5d09bf563ed3c97c0cffb87cf1c78a591bf48bb218d9aa" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d96137f14f244c37f989d9fff8f95e6c18b918e71f36638f8c49112e4c78f" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] 
+name = "env_logger" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95b3f3e67048839cb0d0781f445682a35113da7121f7c949db0e2be96a4fbece" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "indicatif" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25" +dependencies = [ + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", +] + +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi", + "rustix", + "windows-sys 0.48.0", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.151" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" + +[[package]] +name = "linux-raw-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" + +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "macro_rules_attribute" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" + +[[package]] +name = "matrixmultiply" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "monostate" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e404e13820ea0df0eda93aa294e0c80de76a0daa6bec590d376fbec6d7810394" +dependencies = [ + "monostate-impl", + "serde", +] + +[[package]] +name = "monostate-impl" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531c82a934da419bed3da09bd87d6e98c72f8d4aa755427b3b009c2b8b8c433c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", 
+ "minimal-lexical", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "numpy" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef41cbb417ea83b30525259e30ccef6af39b31c240bda578889494c5392d331" +dependencies = [ + "libc", + "ndarray", + "num-complex", + "num-integer", + "num-traits", + "pyo3", + "rustc-hash", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "onig" +version = "6.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" +dependencies = [ + "bitflags 1.3.2", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + +[[package]] +name = "portable-atomic" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "proc-macro2" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53bdbb96d49157e65d45cc287af5f32ffadd5f4761438b527b055fb0d4bb8233" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "parking_lot", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deaa5745de3f5231ce10517a1f5dd97d53e5a2fd77aa6b5842292085831d48d7" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b42531d03e08d4ef1f6e85a2ed422eb678b8cd62b762e53891c05faf0d4afa" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7305c720fa01b8055ec95e484a6eca7a83c841267f0dd5280f0c8b8551d2c158" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c7e9b68bb9c3149c5b0cade5d07f953d6d125eb4337723c4ccdb665f1f96185" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +dependencies = [ + "either", + "itertools", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] 
+name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "ryu" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.193" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.193" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "serde_json" +version = "1.0.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "smallvec" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64", + "nom", + "serde", + "unicode-segmentation", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] 
+name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c8b28c477cc3bf0e7966561e3460130e1255f7a1cf71931075f1c5e7a7e269" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c39fd04924ca3a864207c66fc2cd7d22d7c016007f9ce846cbb9326331930a" + +[[package]] +name = "tempfile" +version = "3.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys 0.48.0", +] + +[[package]] +name = "termcolor" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff1bc3d3f05aff0403e8ac0d92ced918ec05b666a43f83297ccef5bea8a3d449" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "thiserror" +version = "1.0.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f11c217e1416d6f036b870f14e0413d480dbf28edbee1f877abaf0206af43bb7" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01742297787513b79cf8e29d1056ede1313e2420b7b3b15d0a768b4921f549df" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "tokenizers" +version = "0.15.1" +dependencies = [ + "aho-corasick", + "clap", + "derive_builder", + "esaxx-rs", + "getrandom", + "indicatif", + "itertools", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax 0.7.5", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + +[[package]] +name = "tokenizers-python" +version = "0.15.1" +dependencies = [ + "env_logger", + "itertools", + "libc", + "ndarray", + "numpy", + "onig", + "pyo3", + "rayon", + "serde", + "serde_json", + "tempfile", + "tokenizers", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "unicode-width" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + 
"windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + 
+[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 3b1b1bbf1..c12e8a9d7 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -29,3 +29,7 @@ pyo3 = { version = "0.21", features = ["auto-initialize"] } [features] defaut = ["pyo3/extension-module"] + +[profile.profiling] +inherits = "release" +debug = true diff --git a/bindings/python/benches/bench_gpt2.py b/bindings/python/benches/bench_gpt2.py new file mode 100644 index 000000000..2a4dd379c --- /dev/null +++ b/bindings/python/benches/bench_gpt2.py @@ -0,0 +1,76 @@ +import base64 +import functools +import gzip +import json +import os +import random +import time +from typing import Any, cast + +import blobfile + +import tiktoken +from tokenizers import Tokenizer + + +def format_byte_size(num_bytes: int) -> str: + """Convert bytes to a human-readable format (KB, MB, GB).""" + for unit in ["B", "KB", "MB", "GB", "TB"]: + if num_bytes < 1024: + return f"{num_bytes:.2f} {unit}", unit + num_bytes /= 1024 + return f"{num_bytes:.2f} PB", unit + + +def benchmark_batch(documents: list[str]) -> None: + num_threads = int(os.environ["RAYON_NUM_THREADS"]) + num_bytes = sum(map(len, map(str.encode, documents))) + readable_size, unit = format_byte_size(num_bytes) + print(f"num_threads: {num_threads}, data size: {readable_size}") + enc = tiktoken.get_encoding("gpt2") + enc.encode("warmup") + + start = time.perf_counter_ns() + enc.encode_ordinary_batch(documents, num_threads=num_threads) + end = time.perf_counter_ns() + + readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9) + print(f"tiktoken \t{readable_size} / s") + + hf_enc = Tokenizer.from_pretrained("gpt2") + hf_enc.encode("warmup") + + start = time.perf_counter_ns() + hf_enc.encode_batch(documents) + end = time.perf_counter_ns() + readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9) + print(f"huggingface \t{readable_size} / s") + + +import os +import time +import tqdm +from datasets import load_dataset +import tiktoken + + +def test_on_xnli(): + dataset_xnli = load_dataset("facebook/xnli", "all_languages") + + # Varying the number of threads and length of input + num_threads_list = [1, 4, 8, 16, 32] # Example thread counts + input_lengths = [10, 100, 1000, 10_000] # Example input lengths + + documents = ["".join(item["premise"].values()) for item in dataset_xnli["train"]] + for num_threads in num_threads_list: + os.environ["RAYON_NUM_THREADS"] = str(num_threads) + 
os.environ["TOKENIZER_PARALLELISM"] = str(num_threads) + os.environ["RAYON_RS_NUM_THREADS"] = str(num_threads) + for length in input_lengths: + if length == 100_000 and num_threads == 1: + break + benchmark_batch(documents[:length]) + + +# Call the function to run the benchmark +test_on_xnli() From 7038928b76196077ddb467d228cd226dc8f623ad Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 22 Jul 2024 13:42:27 +0200 Subject: [PATCH 12/17] push fast path --- tokenizers/Cargo.toml | 1 + tokenizers/src/lib.rs | 2 ++ tokenizers/src/tokenizer/mod.rs | 29 +++++++++++++++++++++-------- tokenizers/src/utils/parallelism.rs | 2 +- 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index cd3e5c89f..bf7c6cab3 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -71,6 +71,7 @@ fancy-regex = { version = "0.13", optional = true} getrandom = { version = "0.2.10" } esaxx-rs = { version = "0.1.10", default-features = false, features=[]} monostate = "0.1.12" +mimalloc = "0.1" [features] default = ["progressbar", "onig", "esaxx_fast"] diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs index eb89b9315..071a72e50 100644 --- a/tokenizers/src/lib.rs +++ b/tokenizers/src/lib.rs @@ -151,3 +151,5 @@ pub use utils::parallelism; // Re-export for from_pretrained #[cfg(feature = "http")] pub use utils::from_pretrained::FromPretrainedParameters; +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index 98cf9adbe..d4b04ddd1 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -9,6 +9,10 @@ //! - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding, //! ...). 
+use rayon::ThreadPoolBuilder;
+use serde::de::DeserializeOwned;
+use serde::{Deserialize, Serialize};
+use std::env;
 use std::{
     collections::HashMap,
     fs::{read_to_string, File},
@@ -18,9 +22,6 @@ use std::{
     path::{Path, PathBuf},
 };
 
-use serde::de::DeserializeOwned;
-use serde::{Deserialize, Serialize};
-
 use crate::utils::iter::ResultShunt;
 use crate::utils::parallelism::*;
 use crate::utils::progress::{ProgressBar, ProgressStyle};
@@ -1047,11 +1048,23 @@ where
     where
         E: Into<EncodeInput<'s>> + Send,
     {
-        let mut encodings = inputs
-            .into_maybe_par_iter()
-            .map(|input| self.encode_char_offsets(input, add_special_tokens))
-            .collect::<Result<Vec<Encoding>>>()?;
-
+        let num_threads = env::var("RAYON_NUM_THREADS")
+            .unwrap_or_else(|_| "4".to_string()) // Fall back to 4 threads if the variable is not set
+            .parse::<usize>()
+            .expect("RAYON_NUM_THREADS must be a valid integer");
+
+        // Create the thread pool with the specified number of threads
+        let pool = ThreadPoolBuilder::new()
+            .num_threads(num_threads)
+            .build()
+            .unwrap();
+        let mut encodings = pool.install(|| {
+            let result = inputs
+                .into_maybe_par_iter()
+                .map(|input| self.encode_char_offsets(input, add_special_tokens))
+                .collect::<Result<Vec<Encoding>>>();
+            result
+        })?;
         if let Some(params) = &self.padding {
             // We do the padding here to make sure we handle the batch padding
             pad_encodings(&mut encodings, params)?;
diff --git a/tokenizers/src/utils/parallelism.rs b/tokenizers/src/utils/parallelism.rs
index b955731d1..a59d7102f 100644
--- a/tokenizers/src/utils/parallelism.rs
+++ b/tokenizers/src/utils/parallelism.rs
@@ -73,7 +73,7 @@ where
     if parallelism {
         USED_PARALLELISM.store(true, Ordering::SeqCst);
     }
-    CondIterator::new(self, parallelism)
+    CondIterator::new(self, true)
 }
 
 fn into_maybe_par_iter_cond(self, cond: bool) -> CondIterator<P, S> {
From 11dc00ac357b9c747d5fb0536462b5ba8efb2496 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Mon, 22 Jul 2024 19:12:58 +0200
Subject: [PATCH 13/17] better bench

---
 bindings/python/benches/bench_gpt2.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 bindings/python/benches/bench_gpt2.py

diff --git a/bindings/python/benches/bench_gpt2.py b/bindings/python/benches/bench_gpt2.py
old mode 100644
new mode 100755
index 2a4dd379c..7809d643d
--- a/bindings/python/benches/bench_gpt2.py
+++ b/bindings/python/benches/bench_gpt2.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import base64
 import functools
 import gzip
@@ -58,8 +59,8 @@ def test_on_xnli():
     dataset_xnli = load_dataset("facebook/xnli", "all_languages")
 
     # Varying the number of threads and length of input
-    num_threads_list = [1, 4, 8, 16, 32]  # Example thread counts
-    input_lengths = [10, 100, 1000, 10_000]  # Example input lengths
+    num_threads_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 32]  # Example thread counts
+    input_lengths = [10_000]  # Example input lengths
From 7708bfc8ad33a0337184c96213d48cc81b0ddcc8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Wed, 24 Jul 2024 19:35:33 +0200
Subject: [PATCH 14/17] nit

---
 bindings/python/Cargo.toml | 4 ++--
 tokenizers/Cargo.toml      | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index c12e8a9d7..376523ded 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -30,6 +30,6 @@ pyo3 = { version = "0.21", features = ["auto-initialize"] }
 
 [features]
 defaut = ["pyo3/extension-module"]
-[profile.profiling]
-inherits = "release"
+[profile.release]
 debug = true
+strip = false
\ No newline at end of file
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index bf7c6cab3..d08953f9a 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -88,4 +88,5 @@ tracing = "0.1"
 tracing-subscriber = "0.3.18"
 
 [profile.release]
-lto = "fat"
+debug = true
+strip = false
\ No newline at end of file
From 214e117a72f8ba4e188c03504882358731ad8803 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Wed, 24 Jul 2024 19:46:49 +0200
Subject: [PATCH 15/17] push

---
 bindings/python/benches/bench_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/benches/bench_gpt2.py b/bindings/python/benches/bench_gpt2.py
index 7809d643d..e6a64abe3 100755
--- a/bindings/python/benches/bench_gpt2.py
+++ b/bindings/python/benches/bench_gpt2.py
@@ -38,7 +38,7 @@ def benchmark_batch(documents: list[str]) -> None:
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-    hf_enc = Tokenizer.from_pretrained("gpt2")
+    hf_enc = Tokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
     hf_enc.encode("warmup")
 
     start = time.perf_counter_ns()
From b6bdcb80282b4ec08437ba488b7b26dd5085b2d6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Wed, 24 Jul 2024 20:09:45 +0200
Subject: [PATCH 16/17] revert and cleanup

---
 tokenizers/src/lib.rs                        |  2 --
 tokenizers/src/tokenizer/added_vocabulary.rs | 22 +-------------------
 tokenizers/src/tokenizer/mod.rs              |  3 ++-
 tokenizers/src/tokenizer/pre_tokenizer.rs    | 19 -----------------
 tokenizers/src/utils/parallelism.rs          |  2 +-
 5 files changed, 4 insertions(+), 44 deletions(-)

diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 071a72e50..eb89b9315 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -151,5 +151,3 @@ pub use utils::parallelism;
 // Re-export for from_pretrained
 #[cfg(feature = "http")]
 pub use utils::from_pretrained::FromPretrainedParameters;
-#[global_allocator]
-static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 7ed2ef8b0..98af3cb46 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -453,26 +453,6 @@ impl AddedVocabulary {
         splits
     }
 
-    fn fast_split_with_indices(
-        &self,
-        sentence: NormalizedString,
-        split_re: &MatchingSet,
-    ) -> Vec<(NormalizedString, Option<Vec<Token>>)> {
-        self.find_matches(sentence.get(), split_re)
-            .into_iter()
-            .map(|(id, byte_offsets)| {
-                let slice = sentence
-                    .slice(Range::Normalized(byte_offsets.0..byte_offsets.1))
-                    .expect("AddedVocabulary bad split");
-                if let Some(id) = id {
-                    (slice, Some(vec![Token::new(id, String::new(), (0, 0))]))
-                } else {
-                    (slice, None)
-                }
-            })
-            .collect()
-    }
-
     /// Split the input sentence to extract anything we found from the `MatchingSet`, as well as
     /// the list of corresponding IDs
     /// The list of IDs have the exact same number of elements than the Iterator.
@@ -514,7 +494,7 @@ impl AddedVocabulary {
         // 1. We extract all the non-normalized tokens from the non-normalized string
         pretokenized
             .split(|_, sequence| {
-                Ok(self.fast_split_with_indices(
+                Ok(self.split_with_indices(
                     sequence,
                     &self.split_trie_vec[hash_current_thread() % MAX_NUM_THREADS],
                 ))
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index d4b04ddd1..550046982 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -883,7 +883,7 @@ where
     ) -> Result<Encoding> {
         let mut pretokenized: PreTokenizedString = pretokenized.into();
         pretokenized.tokenize(|normalized| self.model.tokenize(normalized.get()))?;
-        pretokenized.fast_into_encoding()
+        pretokenized.into_encoding(word_idx, type_id, offsets_type)
     }
 }
@@ -1058,6 +1058,7 @@ where
             .num_threads(num_threads)
             .build()
             .unwrap();
+
         let mut encodings = pool.install(|| {
             let result = inputs
                 .into_maybe_par_iter()
diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs
index c645d0da9..54e24f76a 100644
--- a/tokenizers/src/tokenizer/pre_tokenizer.rs
+++ b/tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -186,25 +186,6 @@ impl PreTokenizedString {
         }
     }
 
-    pub fn fast_into_encoding(self) -> Result<Encoding> {
-        if self.splits.is_empty() {
-            Ok(Encoding::default())
-        } else if !self.splits.iter().all(|split| split.tokens.is_some()) {
-            Err("Split has not been tokenized.".into())
-        } else {
-            let tokens = self
-                .splits
-                .into_iter()
-                .flat_map(|split| {
-                    split.tokens.unwrap().into_iter().map(|token| {
-                        // Replace this with the actual fields you need for the Encoding type
-                        (token.id, String::new(), (0, 0), None, 0)
-                    })
-                })
-                .collect();
-            Ok(tokens)
-        }
-    }
     /// Returns a list of splits, each of them being a slice of the normalized
     /// string, the associated offsets either in original or normalized
     /// referential, as well as the potention tokens
diff --git a/tokenizers/src/utils/parallelism.rs b/tokenizers/src/utils/parallelism.rs
index a59d7102f..b955731d1 100644
--- a/tokenizers/src/utils/parallelism.rs
+++ b/tokenizers/src/utils/parallelism.rs
@@ -73,7 +73,7 @@ where
     if parallelism {
         USED_PARALLELISM.store(true, Ordering::SeqCst);
     }
-    CondIterator::new(self, true)
+    CondIterator::new(self, parallelism)
 }
 
 fn into_maybe_par_iter_cond(self, cond: bool) -> CondIterator<P, S> {
From 86f08f66f4728acc9b3fb9c366b55aa66293b227 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Wed, 24 Jul 2024 20:12:42 +0200
Subject: [PATCH 17/17] revert

---
 bindings/python/Cargo.toml          | 4 ----
 tokenizers/benches/bpe_benchmark.rs | 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index 376523ded..3b1b1bbf1 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -29,7 +29,3 @@ pyo3 = { version = "0.21", features = ["auto-initialize"] }
 
 [features]
 defaut = ["pyo3/extension-module"]
-
-[profile.release]
-debug = true
-strip = false
\ No newline at end of file
diff --git a/tokenizers/benches/bpe_benchmark.rs b/tokenizers/benches/bpe_benchmark.rs
index 3589d449a..dd65d233e 100644
--- a/tokenizers/benches/bpe_benchmark.rs
+++ b/tokenizers/benches/bpe_benchmark.rs
@@ -24,8 +24,8 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
     let mut tokenizer = Tokenizer::new(bpe);
     tokenizer.with_pre_tokenizer(ByteLevel::default());
     tokenizer.with_decoder(ByteLevel::default());
-    // tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
-    // tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
+    tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
+    tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
     tokenizer
 }
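
Note: the env-driven scoped pool that [PATCH 12/17] experiments with reduces to the minimal sketch below. It is illustrative only, not code from the series: `encode_one` is a hypothetical stand-in for the per-input work (`encode_char_offsets` in the patch), the tokenizers types are omitted, and error handling is simplified to `expect`.

use rayon::prelude::*;
use rayon::ThreadPoolBuilder;
use std::env;

// Hypothetical stand-in for the per-input work; here it just measures length.
fn encode_one(input: &str) -> usize {
    input.len()
}

fn encode_all(inputs: Vec<String>) -> Vec<usize> {
    // Same convention as the patch: size the pool from RAYON_NUM_THREADS,
    // falling back to 4 workers when the variable is unset or invalid.
    let num_threads = env::var("RAYON_NUM_THREADS")
        .ok()
        .and_then(|v| v.parse::<usize>().ok())
        .unwrap_or(4);
    let pool = ThreadPoolBuilder::new()
        .num_threads(num_threads)
        .build()
        .expect("failed to build rayon thread pool");
    // `install` runs the closure inside this scoped pool, so the parallel
    // iterator uses exactly `num_threads` workers instead of the global pool.
    pool.install(|| inputs.par_iter().map(|s| encode_one(s)).collect())
}

Running it with RAYON_NUM_THREADS=8 mirrors one step of the benchmark's thread-count sweep; the scoped pool is what lets the bench vary thread counts per iteration without touching rayon's process-global pool.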