diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 53ec0c90a..de5a9c57b 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -25,7 +25,7 @@ jobs: architecture: "x64" - name: Install dependencies - run: pip install black + run: pip install black==19.10b0 - name: Check style working-directory: ./bindings/python diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b327d9fca..003f84131 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -29,6 +29,13 @@ jobs: - if: matrix.os == 'ubuntu-latest' run: sudo chown -R $(whoami):$(id -ng) ~/.cargo/ + - name: Install cargo-readme for Ubuntu + if: matrix.os == 'ubuntu-latest' + uses: actions-rs/cargo@v1 + with: + command: install + args: cargo-readme + - name: Build uses: actions-rs/cargo@v1 with: @@ -73,3 +80,10 @@ jobs: with: command: test args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc + + # Verify that Readme.md is up to date. + - name: Make sure, Readme generated from lib.rs matches actual Readme + if: matrix.os == 'ubuntu-latest' + shell: bash + working-directory: ./tokenizers + run: cargo readme > must_match_readme.md && diff must_match_readme.md README.md \ No newline at end of file diff --git a/tokenizers/README.md b/tokenizers/README.md index 20370ffdb..4a882ae30 100644 --- a/tokenizers/README.md +++ b/tokenizers/README.md @@ -33,65 +33,75 @@ The various steps of the pipeline are: 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant that, for example, a language model would need, such as special tokens. -## Quick example +### Deserialization and tokenization example -Train and serialize a Tokenizer. 
+```rust +use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput}; +use tokenizers::models::bpe::BPE; +fn main() -> Result<()> { + let bpe_builder = BPE::from_files("./path/to/vocab.json", "./path/to/merges.txt"); + let bpe = bpe_builder + .dropout(0.1) + .unk_token("[UNK]".into()) + .build()?; -```Rust -use tokenizers::models::bpe::{BpeTrainerBuilder, BPE}; -use tokenizers::Result; -use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence}; -use tokenizers::pre_tokenizers::byte_level::ByteLevel; -use tokenizers::tokenizer::{AddedToken, Tokenizer, Trainer}; - -use std::path::Path; - -fn main() -> Result<()>{ - let vocab_size: usize = 100; + let mut tokenizer = Tokenizer::new(bpe); - let trainer: Box<dyn Trainer> = Box::new( - BpeTrainerBuilder::new() - .show_progress(true) - .vocab_size(vocab_size) - .min_frequency(0) - .special_tokens(vec![ - AddedToken::from("<s>", true), - AddedToken::from("<pad>", true), - AddedToken::from("</s>", true), - AddedToken::from("<unk>", true), - AddedToken::from("<mask>", true), - ]) - .build(), - ); - - let mut tokenizer = Tokenizer::new(Box::new(BPE::default())); - tokenizer.with_normalizer(Box::new(Sequence::new(vec![ - Box::new(Strip::new(true, true)), - Box::new(NFC), - ]))); - tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default())); - - tokenizer.train(&trainer, vec!["/path/to/train.txt".to_string()])?; - tokenizer.save("/path/to/trained_tokenizer", true)?; + let encoding = tokenizer.encode("Hey there!", false)?; + println!("{:?}", encoding.get_tokens()); Ok(()) } ``` -Deserialize a pretrained Tokenizer. 
+### Training and serialization example -```Rust -use tokenizers::Result; -use tokenizers::tokenizer::Tokenizer; - -fn main() -> Result<()>{ +```rust +use tokenizers::decoders::DecoderWrapper; +use tokenizers::models::bpe::{BpeTrainerBuilder, BPE}; +use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper}; +use tokenizers::pre_tokenizers::byte_level::ByteLevel; +use tokenizers::pre_tokenizers::PreTokenizerWrapper; +use tokenizers::processors::PostProcessorWrapper; +use tokenizers::{AddedToken, Model, Result, TokenizerBuilder}; - let tokenizer = Tokenizer::from_file("/path/to/trained_tokenizer")?; +use std::path::Path; - let sample_encoding = tokenizer.encode("Huggingface", false)?; +fn main() -> Result<()> { + let vocab_size: usize = 100; - println!("{:?}", sample_encoding); + let trainer = BpeTrainerBuilder::new() + .show_progress(true) + .vocab_size(vocab_size) + .min_frequency(0) + .special_tokens(vec![ + AddedToken::from(String::from("<s>"), true), + AddedToken::from(String::from("<pad>"), true), + AddedToken::from(String::from("</s>"), true), + AddedToken::from(String::from("<unk>"), true), + AddedToken::from(String::from("<mask>"), true), + ]) + .build(); + + let tokenizer = TokenizerBuilder::new() + .with_model(BPE::default()) + .with_normalizer(Some(Sequence::new(vec![ + NormalizerWrapper::StripNormalizer(Strip::new(true, true)), + NormalizerWrapper::NFC(NFC), + ]))) + .with_pretokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default()))) + .with_postprocessor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default()))) + .with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default()))) + .build()?; + + tokenizer + .train( + &trainer, + vec!["path/to/vocab.txt".to_string()], + )? + .get_model() + .save(Path::new("result-folder"), Some("some-prefix"))?; Ok(()) } diff --git a/tokenizers/README.tpl b/tokenizers/README.tpl new file mode 100644 index 000000000..f0ce3a158 --- /dev/null +++ b/tokenizers/README.tpl @@ -0,0 +1,18 @@ +

+
+ +
+

+

+ Build + + GitHub + + + Doc + +

+
+ + +{{readme}} \ No newline at end of file diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs index 918c72455..2372aa499 100644 --- a/tokenizers/src/lib.rs +++ b/tokenizers/src/lib.rs @@ -2,15 +2,13 @@ #![doc(html_favicon_url = "https://huggingface.co/favicon.ico")] #![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")] -//! # Tokenizers -//! +//! The core of `tokenizers`, written in Rust. //! Provides an implementation of today's most used tokenizers, with a focus on performance and //! versatility. //! -//! ## What is a Tokenizer +//! # What is a Tokenizer //! -//! A Tokenizer works as a pipeline, it processes some raw text as input and outputs an -//! `Encoding`. +//! A Tokenizer works as a pipeline, it processes some raw text as input and outputs an `Encoding`. //! The various steps of the pipeline are: //! //! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are @@ -22,7 +20,7 @@ //! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant //! that, for example, a language model would need, such as special tokens. //! -//! ## Quick example +//! ## Deserialization and tokenization example //! //! ```no_run //! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput}; @@ -43,6 +41,65 @@ //! Ok(()) //! } //! ``` +//! +//! ## Training and serialization example +//! +//! ```no_run +//! use tokenizers::decoders::DecoderWrapper; +//! use tokenizers::models::bpe::{BpeTrainerBuilder, BPE}; +//! use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper}; +//! use tokenizers::pre_tokenizers::byte_level::ByteLevel; +//! use tokenizers::pre_tokenizers::PreTokenizerWrapper; +//! use tokenizers::processors::PostProcessorWrapper; +//! use tokenizers::{AddedToken, Model, Result, TokenizerBuilder}; +//! +//! use std::path::Path; +//! +//! fn main() -> Result<()> { +//! let vocab_size: usize = 100; +//! +//! 
let trainer = BpeTrainerBuilder::new() +//! .show_progress(true) +//! .vocab_size(vocab_size) +//! .min_frequency(0) +//! .special_tokens(vec![ +//! AddedToken::from(String::from("<s>"), true), +//! AddedToken::from(String::from("<pad>"), true), +//! AddedToken::from(String::from("</s>"), true), +//! AddedToken::from(String::from("<unk>"), true), +//! AddedToken::from(String::from("<mask>"), true), +//! ]) +//! .build(); +//! +//! let tokenizer = TokenizerBuilder::new() +//! .with_model(BPE::default()) +//! .with_normalizer(Some(Sequence::new(vec![ +//! NormalizerWrapper::StripNormalizer(Strip::new(true, true)), +//! NormalizerWrapper::NFC(NFC), +//! ]))) +//! .with_pretokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default()))) +//! .with_postprocessor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default()))) +//! .with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default()))) +//! .build()?; +//! +//! tokenizer +//! .train( +//! &trainer, +//! vec!["path/to/vocab.txt".to_string()], +//! )? +//! .get_model() +//! .save(Path::new("result-folder"), Some("some-prefix"))?; +//! +//! Ok(()) +//! } +//! ``` +//! +//! # Additional information +//! +//! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined +//! by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_CPUS` +//! environment variable. As an example setting `RAYON_RS_NUM_CPUS=4` will allocate a maximum of 4 threads. +//! **_Please note this behavior may evolve in the future_** #[macro_use] extern crate log;