diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 85e6e46c3..675bfefff 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -54,7 +54,7 @@ jobs:
         os: [ubuntu-latest, macos-latest]
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v1
+        uses: actions/checkout@v2

       - name: Install Rust
@@ -99,9 +99,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          # ignoring specific CVE which probably isn't affecting this crate
-          # https://github.com/chronotope/chrono/issues/602
-          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2020-0071 --ignore RUSTSEC-2021-0145
+          args: -D warnings -f ./bindings/python/Cargo.lock

       - name: Install
         working-directory: ./bindings/python
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 4510370d8..8640c91d4 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -85,9 +85,7 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: audit
-          # ignoring specific CVE which probably isn't affecting this crate
-          # https://github.com/chronotope/chrono/issues/602
-          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2020-0071 --ignore RUSTSEC-2021-0145
+          args: -D warnings -f ./tokenizers/Cargo.lock

       # Verify that Readme.md is up to date.
       - name: Make sure, Readme generated from lib.rs matches actual Readme
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index 7c158c0e6..b76c68bb3 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -9,25 +9,23 @@ name = "tokenizers"
 crate-type = ["cdylib"]

 [dependencies]
-rayon = "1.3"
+rayon = "1.8"
 serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
-env_logger = "0.7.1"
+env_logger = "0.10.0"
 pyo3 = { version = "0.19" }
 numpy = "0.19.0"
-ndarray = "0.13"
-onig = { version = "6.0", default-features = false }
-itertools = "0.9"
+ndarray = "0.15"
+onig = { version = "6.4", default-features = false }
+itertools = "0.11"

 [dependencies.tokenizers]
 version = "0.14.1-dev.0"
 path = "../../tokenizers"
-default-features = false
-features = ["onig"]

 [dev-dependencies]
-tempfile = "3.1"
+tempfile = "3.8"
 pyo3 = { version = "0.19", features = ["auto-initialize"] }

 [features]
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 205ceb16d..729cd9062 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -45,31 +45,31 @@ harness = false
 [dependencies]
 lazy_static = "1.4"
 rand = "0.8"
-onig = { version = "6.0", default-features = false, optional = true }
-regex = "1.8"
+onig = { version = "6.4", default-features = false, optional = true }
+regex = "1.9"
 regex-syntax = "0.7"
-rayon = "1.7"
-rayon-cond = "0.1"
+rayon = "1.8"
+rayon-cond = "0.3"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-clap = { version = "4.0", features=["derive"], optional = true }
+clap = { version = "4.4", features=["derive"], optional = true }
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.10"
-indicatif = {version = "0.15", optional = true}
-itertools = "0.9"
+indicatif = {version = "0.17", optional = true}
+itertools = "0.11"
 log = "0.4"
 derive_builder = "0.12"
 spm_precompiled = "0.1"
-hf-hub = { version = "0.2.0", optional = true }
-aho-corasick = "0.7"
-paste = "1.0.6"
-macro_rules_attribute = "0.1.2"
-thiserror = "1.0.30"
-fancy-regex = { version = "0.10", optional = true}
-getrandom = { version = "0.2.6" }
-esaxx-rs = { version = "0.1", default-features = false, features=[]}
-monostate = "0.1.5"
+hf-hub = { version = "0.3.2", optional = true }
+aho-corasick = "1.1"
+paste = "1.0.14"
+macro_rules_attribute = "0.2.0"
+thiserror = "1.0.49"
+fancy-regex = { version = "0.11", optional = true}
+getrandom = { version = "0.2.10" }
+esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
+monostate = "0.1.9"

 [features]
 default = ["progressbar", "cli", "onig", "esaxx_fast"]
@@ -80,8 +80,8 @@ cli = ["clap"]
 unstable_wasm = ["fancy-regex", "getrandom/js"]

 [dev-dependencies]
-criterion = "0.4"
-tempfile = "3.1"
+criterion = "0.5"
+tempfile = "3.8"
 assert_approx_eq = "1.1"

 [profile.release]
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index 88eae8ca5..0a7fa4e56 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -223,7 +223,8 @@ impl BpeTrainer {
             let p = ProgressBar::new(0);
             p.set_style(
                 ProgressStyle::default_bar()
-                    .template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
+                    .template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
+                    .expect("Invalid progress template"),
             );
             Some(p)
         } else {
@@ -241,11 +242,10 @@ impl BpeTrainer {
     }

     /// Update the progress bar with the new provided length and message
-    fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &str) {
+    fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &'static str) {
         if let Some(p) = p {
             p.set_message(message);
             p.set_length(len as u64);
-            p.set_draw_delta(len as u64 / 100);
             p.reset();
         }
     }
diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs
index cdddb0a71..5d178e77b 100644
--- a/tokenizers/src/models/unigram/trainer.rs
+++ b/tokenizers/src/models/unigram/trainer.rs
@@ -88,7 +88,8 @@ impl UnigramTrainer {
             let p = ProgressBar::new(0);
             p.set_style(
                 ProgressStyle::default_bar()
-                    .template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
+                    .template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
+                    .expect("Invalid progress template"),
             );
             Some(p)
         } else {
@@ -431,11 +432,10 @@ impl UnigramTrainer {
     }

     /// Update the progress bar with the new provided length and message
-    fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &str) {
+    fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &'static str) {
         if let Some(p) = p {
             p.set_message(message);
             p.set_length(len as u64);
-            p.set_draw_delta(len as u64 / 100);
             p.reset();
         }
     }
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 49870c640..6870b7ec7 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -166,10 +166,12 @@ impl AddedVocabulary {
     pub fn new() -> Self {
         let trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>([]);
+            .build::<_, &&[u8]>([])
+            .expect("The trie should build correctly");
         let normalized_trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>([]);
+            .build::<_, &&[u8]>([])
+            .expect("The normalized trie should build correctly");
         Self {
             added_tokens_map: HashMap::new(),
             added_tokens_map_r: HashMap::new(),
@@ -314,7 +316,8 @@ impl AddedVocabulary {
         let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = non_normalized.into_iter().unzip();
         let trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build(tokens.iter().map(|token| &token.content));
+            .build(tokens.iter().map(|token| &token.content))
+            .expect("Failed to build trie when refreshing tokens");
         self.split_trie = (trie, ids);

         let (ntokens, nids): (Vec<&AddedToken>, Vec<u32>) = normalized.into_iter().unzip();
@@ -330,7 +333,8 @@ impl AddedVocabulary {
             .collect();
         let normalized_trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build(patterns.iter().map(|content| content.get()));
+            .build(patterns.iter().map(|content| content.get()))
+            .expect("Failed to build trie when refreshing tokens (normalized)");
         self.split_normalized_trie = (normalized_trie, nids);
     }
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 915fe7bfa..2d7e10f73 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1078,11 +1078,11 @@ where
                 let progress = ProgressBar::new(len);
                 progress.set_style(
                     ProgressStyle::default_bar()
-                        .template("[{elapsed_precise}] {msg:<40!} {wide_bar} {percent:>18!}%"),
+                        .template("[{elapsed_precise}] {msg:<30!} {wide_bar} {percent:>18!}%")
+                        .expect("Invalid progress template"),
                 );
                 progress
-                    .set_message(&format!("Pre-processing files ({:.2} Mo)", len / 1_000_000));
-                progress.set_draw_delta(len / 100); // Redraw only every 2%
+                    .set_message(format!("Pre-processing files ({:.2} Mo)", len / 1_000_000));
                 Some(progress)
             } else {
                 None
@@ -1131,15 +1131,10 @@ where
                 let progress = ProgressBar::new(len);
                 progress.set_style(
                     ProgressStyle::default_bar()
-                        .template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
+                        .template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
+                        .expect("Invalid progress template"),
                 );
                 progress.set_message("Pre-processing sequences");
-                if len > 0 {
-                    progress.set_draw_delta(len / 100); // Redraw only every 2%
-                } else {
-                    // Trying to have a good default to avoid progress tracking being the bottleneck
-                    progress.set_draw_delta(1000);
-                }
                 Some(progress)
             } else {
                 None
diff --git a/tokenizers/src/utils/progress.rs b/tokenizers/src/utils/progress.rs
index 9b246e807..96e9f6082 100644
--- a/tokenizers/src/utils/progress.rs
+++ b/tokenizers/src/utils/progress.rs
@@ -3,6 +3,7 @@ pub(crate) use indicatif::{ProgressBar, ProgressStyle};

 #[cfg(not(feature = "progressbar"))]
 mod progressbar {
+    use std::borrow::Cow;
     pub struct ProgressBar;
     impl ProgressBar {
         pub fn new(_length: u64) -> Self {
@@ -10,8 +11,7 @@ mod progressbar {
         }

         pub fn set_length(&self, _length: u64) {}
-        pub fn set_draw_delta(&self, _draw_delta: u64) {}
-        pub fn set_message(&self, _message: &str) {}
+        pub fn set_message(&self, _message: impl Into<Cow<'static, str>>) {}
         pub fn finish(&self) {}
         pub fn reset(&self) {}
         pub fn inc(&self, _inc: u64) {}
@@ -23,8 +23,8 @@ mod progressbar {
         pub fn default_bar() -> Self {
             Self {}
         }
-        pub fn template(self, _template: &str) -> Self {
-            self
+        pub fn template(self, _template: &str) -> Result<Self, String> {
+            Ok(self)
         }
     }
 }