From a3ad85b3e83a32e5cf81b1be38eab19b2663aa74 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Fri, 26 Jul 2024 12:16:30 +0200
Subject: [PATCH] Fix clippy + feature test management. (#1580)

* Fix clippy + feature test management.

* That example was local oops.

* CLippy fix.

* Readme indentation.

* README update.
---
 tokenizers/README.md                      |  6 +++---
 tokenizers/src/lib.rs                     |  6 +++---
 tokenizers/src/processors/template.rs     |  5 +++--
 tokenizers/src/tokenizer/mod.rs           | 12 ++++--------
 tokenizers/src/tokenizer/normalizer.rs    |  2 ++
 tokenizers/src/tokenizer/pre_tokenizer.rs |  6 +++---
 tokenizers/src/tokenizer/serialization.rs |  3 +--
 7 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/tokenizers/README.md b/tokenizers/README.md
index aacc051f1..ef014c0c9 100644
--- a/tokenizers/README.md
+++ b/tokenizers/README.md
@@ -128,9 +128,9 @@ fn main() -> Result<()> {
 ## Additional information
 
 - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
-by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
-environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
-**_Please note this behavior may evolve in the future_**
+  by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
+  environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
+  **_Please note this behavior may evolve in the future_**
 
 ## Features
 **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index eb89b9315..657d810ba 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -116,9 +116,9 @@
 //! # Additional information
 //!
 //! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
-//! by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
-//! environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
-//! **_Please note this behavior may evolve in the future_**
+//!   by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
+//!   environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
+//!   **_Please note this behavior may evolve in the future_**
 //!
 //! # Features
 //! **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs
index c5aaa55db..9259180d0 100644
--- a/tokenizers/src/processors/template.rs
+++ b/tokenizers/src/processors/template.rs
@@ -11,6 +11,7 @@
 //! sequences. The final result looks like this:
 //! - Single sequence: `[CLS] Hello there [SEP]`
 //! - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
+//!
 //! With the type ids as following:
 //! ```markdown
 //! [CLS] ... [SEP] ... [SEP]
@@ -75,8 +76,8 @@ pub enum Sequence {
 /// It can be either the input sequence or a [`SpecialToken`]:
 ///
 /// - The `Sequence` has an associated `type_id` which is used by default
-/// for any token inside this sequence. The `Sequence` corresponds to one
-/// of the input sequence given as input of the `PostProcessor`.
+///   for any token inside this sequence. The `Sequence` corresponds to one
+///   of the input sequence given as input of the `PostProcessor`.
 ///
 /// - The `SpecialToken` has an associated `id`. It corresponds to a [`SpecialToken`].
 ///
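The `template.rs` hunks above only reflow doc comments for a clippy doc-comment lint; the behavior they describe is unchanged. For context, here is a minimal sketch of the `[CLS]`/`[SEP]` template those docs refer to, using the `TemplateProcessing` builder as shown in the crate's own documentation (the vocabulary ids 1 and 0 for `[CLS]` and `[SEP]` are illustrative placeholders, not values from this patch):

```rust
use tokenizers::processors::template::TemplateProcessing;

fn main() {
    // Single sequences render as `[CLS] $A [SEP]` (type id 0 throughout);
    // pairs append `$B:1 [SEP]:1`, producing the type-id layout described
    // in the doc comment above.
    let processor = TemplateProcessing::builder()
        .try_single("[CLS] $A [SEP]")
        .unwrap()
        .try_pair("[CLS] $A:0 [SEP]:0 $B:1 [SEP]:1")
        .unwrap()
        // Placeholder vocabulary ids for the special tokens.
        .special_tokens(vec![("[CLS]", 1), ("[SEP]", 0)])
        .build()
        .unwrap();
    let _ = processor;
}
```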
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 99e2b7127..766ee1cd9 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -3,11 +3,11 @@
 //! A [`Tokenizer`](struct.Tokenizer.html) is composed of some of the following parts.
 //! - [`Normalizer`](trait.Normalizer.html): Takes care of the text normalization (like unicode normalization).
 //! - [`PreTokenizer`](trait.PreTokenizer.html): Takes care of the pre tokenization (ie. How to split tokens and pre-process
-//! them.
+//!   them.
 //! - [`Model`](trait.Model.html): A model encapsulates the tokenization algorithm (like BPE, Word base, character
-//! based, ...).
+//!   based, ...).
 //! - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding,
-//! ...).
+//!   ...).
 
 use std::{
     collections::HashMap,
@@ -1297,17 +1297,13 @@ where
 
 #[cfg(test)]
 mod test {
-
-    use crate::AddedToken;
-    use crate::Tokenizer;
-
     #[cfg(feature = "http")]
     #[test]
     fn test_decoding_with_added_bpe() {
         use crate::{
             normalizers,
             pre_tokenizers::split::{Split, SplitPattern},
-            NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior,
+            AddedToken, NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior, Tokenizer,
         };
 
         let mut tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3-8B", None).unwrap();
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 8d5b66455..e2f501abe 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -305,6 +305,7 @@ impl NormalizedString {
     /// - `1` if this is a new char
     /// - `-N` if the char is right before N removed chars
     /// - `0` if the char is replacing the existing one
+    ///
     /// Since it is possible that the normalized string doesn't include some of the characters at
     /// the beginning of the original one, we need an `initial_offset` which represents the number
     /// of removed chars at the very beginning.
@@ -424,6 +425,7 @@ impl NormalizedString {
     /// - `1` if this is a new char
     /// - `-N` if the char is right before N removed chars
     /// - `0` if the char is replacing the existing one
+    ///
     /// Since it is possible that the normalized string doesn't include some of the characters at
     /// the beginning of the original one, we need an `initial_offset` which represents the number
     /// of removed chars at the very beginning.
diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs
index 54e24f76a..9667c240a 100644
--- a/tokenizers/src/tokenizer/pre_tokenizer.rs
+++ b/tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -65,9 +65,9 @@ impl PreTokenizedString {
     ///
     /// There are only one constraint that *MUST* be respected:
     /// > The produced `NormalizedString`, if combined back together, must have the
-    /// same `original` string as the original one given to `split_fn`. This concretely
-    /// means that for the offset tracking to work as expected, `split_fn` must produce
-    /// "splits" of the original string.
+    /// > same `original` string as the original one given to `split_fn`. This concretely
+    /// > means that for the offset tracking to work as expected, `split_fn` must produce
+    /// > "splits" of the original string.
     pub fn split<F, U, R>(&mut self, mut split_fn: F) -> Result<()>
     where
         F: FnMut(usize, NormalizedString) -> Result<U>,
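The blockquote fixed above states the one invariant `PreTokenizedString::split` relies on: every piece returned by `split_fn` must be a true sub-slice of the string it received. A minimal sketch of a conforming `split_fn`, mirroring what the crate's whitespace pre-tokenizer does (not code touched by this patch):

```rust
use tokenizers::tokenizer::{PreTokenizedString, Result, SplitDelimiterBehavior};

fn split_on_whitespace(pretokenized: &mut PreTokenizedString) -> Result<()> {
    // `NormalizedString::split` carves real sub-slices out of its input, so
    // the returned pieces concatenate back to the exact `original` string
    // and offset tracking keeps working.
    pretokenized.split(|_index, normalized| {
        normalized.split(char::is_whitespace, SplitDelimiterBehavior::Removed)
    })
}
```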
diff --git a/tokenizers/src/tokenizer/serialization.rs b/tokenizers/src/tokenizer/serialization.rs
index c3ad5b410..db9b0a403 100644
--- a/tokenizers/src/tokenizer/serialization.rs
+++ b/tokenizers/src/tokenizer/serialization.rs
@@ -177,7 +177,6 @@ where
 mod tests {
     use crate::tokenizer::Tokenizer;
     use std::str::FromStr;
-    use tracing_subscriber::fmt;
 
     #[test]
     fn test_deserialization_serialization_invariant() {
@@ -236,7 +235,7 @@ mod tests {
     #[cfg(feature = "http")]
     #[test]
     fn test_from_pretrained() {
-        fmt()
+        tracing_subscriber::fmt()
             .with_max_level(tracing::Level::DEBUG)
             .with_target(false)
             .init();
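The `serialization.rs` change is the feature-test-management half of the PR: `use tracing_subscriber::fmt;` was only needed by the `http`-gated test, so the call becomes fully qualified and the import disappears, letting `cargo test` without `--features http` build cleanly. A minimal sketch of the same pattern (the model name and assertion are illustrative, not from this patch; run it with `cargo test --features http`):

```rust
#[cfg(test)]
mod tests {
    // Imports needed only by a feature-gated test live inside that test,
    // so a build without the `http` feature has no unused imports.
    #[cfg(feature = "http")]
    #[test]
    fn downloads_a_tokenizer() {
        use tokenizers::Tokenizer;

        // `Tokenizer::from_pretrained` is only compiled with the `http` feature.
        let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
        assert!(tokenizer.get_vocab_size(true) > 0);
    }
}
```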