
Commit

testable example docs for training-serialization (huggingface#373)
* testable usage docs for training and serialization and reference in README.md

* Generate Readme from testable examples + template

* add up-to-date check for Readme with generated one

* try make pipeline fail by adding something to the lib.rs readme

* remove difference from lib.rs again to make pipeline pass

* fix black version

Co-authored-by: Simon Ertl <[email protected]>
ropottnik and Simon Ertl authored Aug 31, 2020
1 parent c036cd4 commit 50ac90d
Showing 5 changed files with 152 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
@@ -25,7 +25,7 @@ jobs:
architecture: "x64"

- name: Install dependencies
run: pip install black
run: pip install black==19.10b0

- name: Check style
working-directory: ./bindings/python
14 changes: 14 additions & 0 deletions .github/workflows/rust.yml
@@ -29,6 +29,13 @@ jobs:
- if: matrix.os == 'ubuntu-latest'
run: sudo chown -R $(whoami):$(id -ng) ~/.cargo/

- name: Install cargo-readme for Ubuntu
if: matrix.os == 'ubuntu-latest'
uses: actions-rs/cargo@v1
with:
command: install
args: cargo-readme

- name: Build
uses: actions-rs/cargo@v1
with:
@@ -73,3 +80,10 @@ jobs:
with:
command: test
args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc

# Verify that Readme.md is up to date.
- name: Make sure, Readme generated from lib.rs matches actual Readme
if: matrix.os == 'ubuntu-latest'
shell: bash
working-directory: ./tokenizers
run: cargo readme > must_match_readme.md && diff must_match_readme.md README.md
102 changes: 56 additions & 46 deletions tokenizers/README.md
@@ -33,65 +33,75 @@ The various steps of the pipeline are:
4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
that, for example, a language model would need, such as special tokens.

## Quick example
### Deserialization and tokenization example

Train and serialize a Tokenizer.
```rust
use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
use tokenizers::models::bpe::BPE;

fn main() -> Result<()> {
let bpe_builder = BPE::from_files("./path/to/vocab.json", "./path/to/merges.txt");
let bpe = bpe_builder
.dropout(0.1)
.unk_token("[UNK]".into())
.build()?;

```Rust
use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
use tokenizers::Result;
use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence};
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::tokenizer::{AddedToken, Tokenizer, Trainer};

use std::path::Path;

fn main() -> Result<()>{
let vocab_size: usize = 100;
let mut tokenizer = Tokenizer::new(bpe);

let trainer: Box<dyn Trainer> = Box::new(
BpeTrainerBuilder::new()
.show_progress(true)
.vocab_size(vocab_size)
.min_frequency(0)
.special_tokens(vec![
AddedToken::from("<s>", true),
AddedToken::from("<pad>", true),
AddedToken::from("</s>", true),
AddedToken::from("<unk>", true),
AddedToken::from("<mask>", true),
])
.build(),
);

let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));
tokenizer.with_normalizer(Box::new(Sequence::new(vec![
Box::new(Strip::new(true, true)),
Box::new(NFC),
])));
tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));

tokenizer.train(&trainer, vec!["/path/to/train.txt".to_string()])?;
tokenizer.save("/path/to/trained_tokenizer", true)?;
let encoding = tokenizer.encode("Hey there!", false)?;
println!("{:?}", encoding.get_tokens());

Ok(())
}
```

Deserialize a pretrained Tokenizer.
### Training and serialization example

```Rust
use tokenizers::Result;
use tokenizers::tokenizer::Tokenizer;

fn main() -> Result<()>{
```rust
use tokenizers::decoders::DecoderWrapper;
use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::pre_tokenizers::PreTokenizerWrapper;
use tokenizers::processors::PostProcessorWrapper;
use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};

let tokenizer = Tokenizer::from_file("/path/to/trained_tokenizer")?;
use std::path::Path;

let sample_encoding = tokenizer.encode("Huggingface", false)?;
fn main() -> Result<()> {
let vocab_size: usize = 100;

println!("{:?}", sample_encoding);
let trainer = BpeTrainerBuilder::new()
.show_progress(true)
.vocab_size(vocab_size)
.min_frequency(0)
.special_tokens(vec![
AddedToken::from(String::from("<s>"), true),
AddedToken::from(String::from("<pad>"), true),
AddedToken::from(String::from("</s>"), true),
AddedToken::from(String::from("<unk>"), true),
AddedToken::from(String::from("<mask>"), true),
])
.build();

let tokenizer = TokenizerBuilder::new()
.with_model(BPE::default())
.with_normalizer(Some(Sequence::new(vec![
NormalizerWrapper::StripNormalizer(Strip::new(true, true)),
NormalizerWrapper::NFC(NFC),
])))
.with_pretokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default())))
.with_postprocessor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default())))
.with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default())))
.build()?;

tokenizer
.train(
&trainer,
vec!["path/to/vocab.txt".to_string()],
)?
.get_model()
.save(Path::new("result-folder"), Some("some-prefix"))?;

Ok(())
}
18 changes: 18 additions & 0 deletions tokenizers/README.tpl
@@ -0,0 +1,18 @@
<p align="center">
<br>
<img src="https://huggingface.co/landing/assets/tokenizers/tokenizers-logo.png" width="600"/>
<br>
<p>
<p align="center">
<img alt="Build" src="https://github.com/huggingface/tokenizers/workflows/Rust/badge.svg">
<a href="https://github.com/huggingface/tokenizers/blob/master/LICENSE">
<img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
</a>
<a href="https://docs.rs/tokenizers/">
<img alt="Doc" src="https://docs.rs/tokenizers/badge.svg">
</a>
</p>
<br>


{{readme}}
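
For local verification, the CI check above can be reproduced by hand. This is a sketch assuming cargo-readme's default behavior of picking up `README.tpl` from the crate root and substituting the crate-level docs from `src/lib.rs` for `{{readme}}`:

```sh
# Mirrors the workflow step's working-directory
cd tokenizers

# Regenerate the README from src/lib.rs + README.tpl, then compare it against
# the committed README.md; any diff output means README.md is out of date.
cargo readme > must_match_readme.md
diff must_match_readme.md README.md
```
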
69 changes: 63 additions & 6 deletions tokenizers/src/lib.rs
@@ -2,15 +2,13 @@
#![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
#![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")]

//! # Tokenizers
//!
//! The core of `tokenizers`, written in Rust.
//! Provides an implementation of today's most used tokenizers, with a focus on performance and
//! versatility.
//!
//! ## What is a Tokenizer
//! # What is a Tokenizer
//!
//! A Tokenizer works as a pipeline, it processes some raw text as input and outputs an
//! `Encoding`.
//! A Tokenizer works as a pipeline, it processes some raw text as input and outputs an `Encoding`.
//! The various steps of the pipeline are:
//!
//! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
@@ -22,7 +20,7 @@
//! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
//! that, for example, a language model would need, such as special tokens.
//!
//! ## Quick example
//! ## Deserialization and tokenization example
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
@@ -43,6 +41,65 @@
//! Ok(())
//! }
//! ```
//!
//! ## Training and serialization example
//!
//! ```no_run
//! use tokenizers::decoders::DecoderWrapper;
//! use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
//! use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
//! use tokenizers::pre_tokenizers::PreTokenizerWrapper;
//! use tokenizers::processors::PostProcessorWrapper;
//! use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};
//!
//! use std::path::Path;
//!
//! fn main() -> Result<()> {
//! let vocab_size: usize = 100;
//!
//! let trainer = BpeTrainerBuilder::new()
//! .show_progress(true)
//! .vocab_size(vocab_size)
//! .min_frequency(0)
//! .special_tokens(vec![
//! AddedToken::from(String::from("<s>"), true),
//! AddedToken::from(String::from("<pad>"), true),
//! AddedToken::from(String::from("</s>"), true),
//! AddedToken::from(String::from("<unk>"), true),
//! AddedToken::from(String::from("<mask>"), true),
//! ])
//! .build();
//!
//! let tokenizer = TokenizerBuilder::new()
//! .with_model(BPE::default())
//! .with_normalizer(Some(Sequence::new(vec![
//! NormalizerWrapper::StripNormalizer(Strip::new(true, true)),
//! NormalizerWrapper::NFC(NFC),
//! ])))
//! .with_pretokenizer(Some(PreTokenizerWrapper::ByteLevel(ByteLevel::default())))
//! .with_postprocessor(Some(PostProcessorWrapper::ByteLevel(ByteLevel::default())))
//! .with_decoder(Some(DecoderWrapper::ByteLevel(ByteLevel::default())))
//! .build()?;
//!
//! tokenizer
//! .train(
//! &trainer,
//! vec!["path/to/vocab.txt".to_string()],
//! )?
//! .get_model()
//! .save(Path::new("result-folder"), Some("some-prefix"))?;
//!
//! Ok(())
//! }
//! ```
//!
//! # Additional information
//!
//! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
//! by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_CPUS`
//! environment variable. As an example setting `RAYON_RS_NUM_CPUS=4` will allocate a maximum of 4 threads.
//! **_Please note this behavior may evolve in the future_**
#[macro_use]
extern crate log;
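
The "Additional information" note added above mentions the `RAYON_RS_NUM_CPUS` environment variable for limiting CPU parallelism. As a rough illustration (not part of this commit), the doc-test invocation from the Rust workflow could be capped at four threads like this:

```sh
# Limit the thread pool used by tokenizers to 4 threads for this run
RAYON_RS_NUM_CPUS=4 cargo test --verbose --manifest-path ./tokenizers/Cargo.toml --doc
```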
