
Merge branch 'huggingface:main' into main
MikeIvanichev authored Jul 26, 2024
2 parents 74265db + a3ad85b commit 66b010b
Showing 7 changed files with 19 additions and 21 deletions.
6 changes: 3 additions & 3 deletions tokenizers/README.md
@@ -128,9 +128,9 @@ fn main() -> Result<()> {
## Additional information

- tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
**_Please note this behavior may evolve in the future_**
by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
**_Please note this behavior may evolve in the future_**
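As a rough illustration of the setting described above: a minimal sketch of a batch-encoding run capped at 4 threads, assuming a serialized `tokenizer.json` is available locally and that the variable is set before the first parallel encode.

```rust
// Run as: RAYON_RS_NUM_THREADS=4 cargo run
use tokenizers::tokenizer::{Result, Tokenizer};

fn main() -> Result<()> {
    // Placeholder path: any serialized tokenizer file works here.
    let tokenizer = Tokenizer::from_file("tokenizer.json")?;

    // Batch encoding is where the CPU parallelism described above is used.
    let inputs = vec!["Hello there!", "How are you?", "Fine, thanks."];
    let encodings = tokenizer.encode_batch(inputs, true)?;

    for encoding in &encodings {
        println!("{:?}", encoding.get_tokens());
    }
    Ok(())
}
```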

## Features
**progressbar**: The progress bar visualization is enabled by default. It might be disabled if
6 changes: 3 additions & 3 deletions tokenizers/src/lib.rs
@@ -116,9 +116,9 @@
//! # Additional information
//!
//! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism is determined
//! by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
//! environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
//! **_Please note this behavior may evolve in the future_**
//! by the total number of core/threads your CPU provides but this can be tuned by setting the `RAYON_RS_NUM_THREADS`
//! environment variable. As an example setting `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
//! **_Please note this behavior may evolve in the future_**
//!
//! # Features
//! **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
5 changes: 3 additions & 2 deletions tokenizers/src/processors/template.rs
@@ -11,6 +11,7 @@
//! sequences. The final result looks like this:
//! - Single sequence: `[CLS] Hello there [SEP]`
//! - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
//!
//! With the type ids as following:
//! ```markdown
//! [CLS] ... [SEP] ... [SEP]
@@ -75,8 +76,8 @@ pub enum Sequence {
/// It can be either the input sequence or a [`SpecialToken`]:
///
/// - The `Sequence` has an associated `type_id` which is used by default
/// for any token inside this sequence. The `Sequence` corresponds to one
/// of the input sequence given as input of the `PostProcessor`.
/// for any token inside this sequence. The `Sequence` corresponds to one
/// of the input sequence given as input of the `PostProcessor`.
///
/// - The `SpecialToken` has an associated `id`. It corresponds to a [`SpecialToken`].
///
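For context, the builder documented in this module can express the `[CLS] ... [SEP] ... [SEP]` template above; the special-token ids below are placeholders for a real vocabulary.

```rust
use tokenizers::processors::template::TemplateProcessing;

fn main() {
    // Single sequences become `[CLS] $A [SEP]`; pairs get a second segment
    // whose tokens carry type id 1, mirroring the type ids in the module docs.
    let _template = TemplateProcessing::builder()
        .try_single("[CLS] $A [SEP]")
        .unwrap()
        .try_pair("[CLS] $A:0 [SEP] $B:1 [SEP]:1")
        .unwrap()
        .special_tokens(vec![("[CLS]", 1), ("[SEP]", 0)])
        .build()
        .unwrap();
}
```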
12 changes: 4 additions & 8 deletions tokenizers/src/tokenizer/mod.rs
@@ -3,11 +3,11 @@
//! A [`Tokenizer`](struct.Tokenizer.html) is composed of some of the following parts.
//! - [`Normalizer`](trait.Normalizer.html): Takes care of the text normalization (like unicode normalization).
//! - [`PreTokenizer`](trait.PreTokenizer.html): Takes care of the pre tokenization (ie. How to split tokens and pre-process
//! them.
//! them.
//! - [`Model`](trait.Model.html): A model encapsulates the tokenization algorithm (like BPE, Word base, character
//! based, ...).
//! based, ...).
//! - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding,
//! ...).
//! ...).
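For orientation, a sketch in the spirit of the README quick-start showing how these parts fit around a `Model`; the vocab/merges paths are placeholders.

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::tokenizer::{Result, Tokenizer};

fn main() -> Result<()> {
    // Model: the BPE tokenization algorithm, built from placeholder files.
    let bpe = BPE::from_file("./vocab.json", "./merges.txt")
        .unk_token("[UNK]".into())
        .build()?;

    // The Tokenizer wraps the model; a Normalizer, PreTokenizer and
    // PostProcessor could then be attached via `with_normalizer`,
    // `with_pre_tokenizer` and `with_post_processor`.
    let tokenizer = Tokenizer::new(bpe);

    let encoding = tokenizer.encode("Hey there!", false)?;
    println!("{:?}", encoding.get_tokens());
    Ok(())
}
```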
use std::{
collections::HashMap,
@@ -1297,17 +1297,13 @@ where

#[cfg(test)]
mod test {

use crate::AddedToken;
use crate::Tokenizer;

#[cfg(feature = "http")]
#[test]
fn test_decoding_with_added_bpe() {
use crate::{
normalizers,
pre_tokenizers::split::{Split, SplitPattern},
NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior,
AddedToken, NormalizerWrapper, PreTokenizerWrapper, SplitDelimiterBehavior, Tokenizer,
};

let mut tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3-8B", None).unwrap();
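For reference, a hedged sketch of the pattern this test exercises, assuming the `http` feature is enabled and substituting a hypothetical, publicly downloadable model id for the gated Llama 3 checkpoint.

```rust
use tokenizers::{AddedToken, Result, Tokenizer};

fn main() -> Result<()> {
    // Requires the `http` feature; the model id is only an illustration.
    let mut tokenizer = Tokenizer::from_pretrained("bert-base-uncased", None)?;
    tokenizer.add_tokens(&[AddedToken::from("[CUSTOM]", true)]);

    // Round-trip: encode with the added token, then decode it back.
    let encoding = tokenizer.encode("hello [CUSTOM] world", false)?;
    let decoded = tokenizer.decode(encoding.get_ids(), false)?;
    println!("{decoded}");
    Ok(())
}
```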
2 changes: 2 additions & 0 deletions tokenizers/src/tokenizer/normalizer.rs
@@ -305,6 +305,7 @@ impl NormalizedString {
/// - `1` if this is a new char
/// - `-N` if the char is right before N removed chars
/// - `0` if the char is replacing the existing one
///
/// Since it is possible that the normalized string doesn't include some of the characters at
/// the beginning of the original one, we need an `initial_offset` which represents the number
/// of removed chars at the very beginning.
@@ -424,6 +425,7 @@ impl NormalizedString {
/// - `1` if this is a new char
/// - `-N` if the char is right before N removed chars
/// - `0` if the char is replacing the existing one
///
/// Since it is possible that the normalized string doesn't include some of the characters at
/// the beginning of the original one, we need an `initial_offset` which represents the number
/// of removed chars at the very beginning.
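A minimal sketch of the convention documented here, assuming the crate-root re-export of `NormalizedString`: lowercase the string while dropping one leading space, so every kept char has change `0` and `initial_offset` is 1.

```rust
use tokenizers::NormalizedString;

fn main() {
    let mut normalized = NormalizedString::from(" Hello");

    // Each kept char replaces an existing one (change = 0); the single
    // removed char at the very beginning is reported via `initial_offset`.
    let replacement: Vec<(char, isize)> = normalized
        .get()
        .chars()
        .skip(1)
        .map(|c| (c.to_ascii_lowercase(), 0))
        .collect();
    normalized.transform(replacement, 1);

    assert_eq!(normalized.get(), "hello");
    assert_eq!(normalized.get_original(), " Hello");
}
```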
6 changes: 3 additions & 3 deletions tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -65,9 +65,9 @@ impl PreTokenizedString {
///
/// There is only one constraint that *MUST* be respected:
/// > The produced `NormalizedString`, if combined back together, must have the
/// same `original` string as the original one given to `split_fn`. This concretely
/// means that for the offset tracking to work as expected, `split_fn` must produce
/// "splits" of the original string.
/// > same `original` string as the original one given to `split_fn`. This concretely
/// > means that for the offset tracking to work as expected, `split_fn` must produce
/// > "splits" of the original string.
pub fn split<F, U, R>(&mut self, mut split_fn: F) -> Result<()>
where
F: FnMut(usize, NormalizedString) -> Result<U>,
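A sketch of a `split_fn` that respects this constraint (the input text is a placeholder): splitting on whitespace produces pieces that are genuine splits of the original string, so offset tracking keeps working.

```rust
use tokenizers::{PreTokenizedString, Result, SplitDelimiterBehavior};

fn main() -> Result<()> {
    let mut pre_tokenized = PreTokenizedString::from("Hello there friend");

    // Each produced NormalizedString is a true split of the original.
    pre_tokenized.split(|_idx, normalized| {
        normalized.split(char::is_whitespace, SplitDelimiterBehavior::Removed)
    })?;
    Ok(())
}
```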
3 changes: 1 addition & 2 deletions tokenizers/src/tokenizer/serialization.rs
@@ -177,7 +177,6 @@ where
mod tests {
use crate::tokenizer::Tokenizer;
use std::str::FromStr;
use tracing_subscriber::fmt;

#[test]
fn test_deserialization_serialization_invariant() {
@@ -236,7 +235,7 @@ mod tests {
#[cfg(feature = "http")]
#[test]
fn test_from_pretrained() {
fmt()
tracing_subscriber::fmt()
.with_max_level(tracing::Level::DEBUG)
.with_target(false)
.init();
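For context, a rough sketch of the round trip that `test_deserialization_serialization_invariant` checks; the tokenizer file path is a placeholder.

```rust
use std::str::FromStr;
use tokenizers::{Result, Tokenizer};

fn main() -> Result<()> {
    // Placeholder path: any serialized tokenizer JSON works here.
    let tokenizer = Tokenizer::from_file("tokenizer.json")?;

    // Serialize, re-parse, and serialize again: the two JSON strings
    // should match if serialization is invariant under deserialization.
    let serialized = tokenizer.to_string(true)?;
    let reparsed = Tokenizer::from_str(&serialized)?;
    assert_eq!(serialized, reparsed.to_string(true)?);
    Ok(())
}
```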
