diff --git a/RELEASE.md b/RELEASE.md
index 1353e6eb9..bbd6b0e78 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -86,5 +86,5 @@ Simple checklist on how to make releases for `tokenizers`.
 
 If you want to make modifications to the CI/CD of the release GH actions, you need to :
 - **Comment the part that uploads the artifacts** to `crates.io`, `PyPi` or `npm`.
-- Change the trigger mecanism so it can trigger every time you push to your branch.
+- Change the trigger mechanism so it can trigger every time you push to your branch.
 - Keep pushing your changes until the artifacts are properly created.
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 5d1aae0ce..5f433ac6a 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -20,12 +20,6 @@ name = "tokenizers"
 path = "src/lib.rs"
 bench = false
 
-[[bin]]
-name = "cli"
-path = "src/cli.rs"
-bench = false
-required-features = ["cli"]
-
 [[bench]]
 name = "bpe_benchmark"
 harness = false
@@ -52,14 +46,13 @@ rayon = "1.8"
 rayon-cond = "0.3"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-clap = { version = "4.4", features=["derive"], optional = true }
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.10"
 indicatif = {version = "0.17", optional = true}
 itertools = "0.12"
 log = "0.4"
-derive_builder = "0.12"
+derive_builder = "0.13"
 spm_precompiled = "0.1"
 hf-hub = { version = "0.3.2", optional = true }
 aho-corasick = "1.1"
@@ -72,11 +65,10 @@ esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
 monostate = "0.1.9"
 
 [features]
-default = ["progressbar", "cli", "onig", "esaxx_fast"]
+default = ["progressbar", "onig", "esaxx_fast"]
 esaxx_fast = ["esaxx-rs/cpp"]
 progressbar = ["indicatif"]
 http = ["hf-hub"]
-cli = ["clap"]
 unstable_wasm = ["fancy-regex", "getrandom/js"]
 
 [dev-dependencies]
diff --git a/tokenizers/src/cli.rs b/tokenizers/src/cli.rs
deleted file mode 100644
index 54b82357f..000000000
--- a/tokenizers/src/cli.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-//!
-//! This is the CLI binary for the Tokenizers project
-//!
-
-use clap::{Parser, Subcommand};
-use std::io::{self, BufRead, Write};
-use tokenizers::models::bpe::BPE;
-use tokenizers::pre_tokenizers::byte_level::ByteLevel;
-use tokenizers::tokenizer::{AddedToken, Result};
-use tokenizers::Tokenizer;
-
-/// Generate custom Tokenizers or use existing ones
-#[derive(Parser, Debug)]
-#[command(author, version)]
-struct Args {
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Subcommand, Debug)]
-enum Command {
-    Shell {
-        /// Path to the vocab.json file
-        vocab: String,
-        /// Path to the merges.txt file
-        merges: String,
-    },
-}
-
-fn shell(vocab: &str, merges: &str) -> Result<()> {
-    let bpe = BPE::from_file(vocab, merges).build()?;
-    let mut tokenizer = Tokenizer::new(bpe);
-    tokenizer
-        .with_pre_tokenizer(ByteLevel::default())
-        .with_decoder(ByteLevel::default());
-
-    tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]);
-    tokenizer
-        .add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]);
-
-    let stdin = io::stdin();
-    let mut handle = stdin.lock();
-    let mut buffer = String::new();
-
-    loop {
-        buffer.clear();
-
-        print!("\nEnter some text to tokenize:\n> ");
-        io::stdout().flush()?;
-        handle.read_line(&mut buffer)?;
-        let buffer = buffer.trim_end();
-
-        let timer = std::time::Instant::now();
-        let encoded = tokenizer.encode(buffer.to_owned(), false)?;
-        let elapsed = timer.elapsed();
-        println!("\nInput:\t\t{}", buffer);
-        println!("Tokens:\t\t{:?}", encoded.get_tokens());
-        println!("IDs:\t\t{:?}", encoded.get_ids());
-        println!("Offsets:\t{:?}", encoded.get_offsets());
-        println!(
-            "Decoded:\t{}",
-            tokenizer.decode(encoded.get_ids(), true).unwrap()
-        );
-        println!("Tokenized in {:?}", elapsed);
-    }
-}
-
-fn main() -> Result<()> {
-    let args = Args::parse();
-    match args.command {
-        Command::Shell { vocab, merges } => shell(&vocab, &merges),
-    }
-}