huggingface · n1t0 · Nov 28, 2020 · Nov 12, 2020 · Nov 25, 2020 · Nov 25, 2020
diff --git a/bindings/node/native/src/tokenizer.rs b/bindings/node/native/src/tokenizer.rs
@@ -749,7 +749,7 @@ declare_types! {
             // train(files: string[], trainer?: Trainer)
 
             let files = cx.extract::<Vec<String>>(0)?;
-            let trainer = if let Some(val) = cx.argument_opt(1) {
+            let mut trainer = if let Some(val) = cx.argument_opt(1) {
                 let js_trainer = val.downcast::<JsTrainer>().or_throw(&mut cx)?;
                 let guard = cx.lock();
 
@@ -768,7 +768,7 @@ declare_types! {
 
             this.borrow_mut(&guard)
                 .tokenizer.write().unwrap()
-                .train(&trainer, files)
+                .train_from_files(&mut trainer, files)
                 .map_err(|e| Error(format!("{}", e)))?;
 
             Ok(cx.undefined().upcast())

diff --git a/bindings/node/native/src/trainers.rs b/bindings/node/native/src/trainers.rs
@@ -4,8 +4,7 @@ use crate::extraction::*;
 use crate::models::Model;
 use crate::tokenizer::AddedToken;
 use neon::prelude::*;
-use std::collections::HashMap;
-use std::sync::Arc;
+use std::sync::{Arc, RwLock};
 
 use tk::models::{
     bpe::BpeTrainer, unigram::UnigramTrainer, wordlevel::WordLevelTrainer,
@@ -15,13 +14,13 @@ use tk::models::{
 /// Trainer
 #[derive(Clone)]
 pub struct Trainer {
-    pub trainer: Option<Arc<TrainerWrapper>>,
+    pub trainer: Option<Arc<RwLock<TrainerWrapper>>>,
 }
 
 impl From<TrainerWrapper> for Trainer {
     fn from(trainer: TrainerWrapper) -> Self {
         Self {
-            trainer: Some(Arc::new(trainer)),
+            trainer: Some(Arc::new(RwLock::new(trainer))),
         }
     }
 }
@@ -33,20 +32,19 @@ impl tk::Trainer for Trainer {
         self.trainer
             .as_ref()
             .expect("Uninitialized Trainer")
+            .read()
+            .unwrap()
             .should_show_progress()
     }
 
-    fn train(
-        &self,
-        words: HashMap<String, u32>,
-        model: &mut Self::Model,
-    ) -> tk::Result<Vec<tk::AddedToken>> {
+    fn train(&self, model: &mut Self::Model) -> tk::Result<Vec<tk::AddedToken>> {
         let special_tokens = self
             .trainer
             .as_ref()
             .ok_or("Uninitialized Trainer")?
+            .read()
+            .unwrap()
             .train(
-                words,
                 &mut model
                     .model
                     .as_ref()
@@ -58,11 +56,18 @@ impl tk::Trainer for Trainer {
         Ok(special_tokens)
     }
 
-    fn process_tokens(&self, words: &mut HashMap<String, u32>, tokens: Vec<String>) {
+    fn feed<I, S, F>(&mut self, iterator: I, process: F) -> tk::Result<()>
+    where
+        I: Iterator<Item = S> + Send,
+        S: AsRef<str> + Send,
+        F: Fn(&str) -> tk::Result<Vec<String>> + Sync,
+    {
         self.trainer
             .as_ref()
-            .expect("Uninitialized Trainer")
-            .process_tokens(words, tokens)
+            .ok_or("Uninitialized Trainer")?
+            .write()
+            .unwrap()
+            .feed(iterator, process)
     }
 }
 
@@ -162,7 +167,7 @@ fn bpe_trainer(mut cx: FunctionContext) -> JsResult<JsTrainer> {
 
     let mut js_trainer = JsTrainer::new::<_, JsTrainer, _>(&mut cx, vec![])?;
     let guard = cx.lock();
-    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(trainer.into()));
+    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(RwLock::new(trainer.into())));
 
     Ok(js_trainer)
 }
@@ -254,7 +259,7 @@ fn wordpiece_trainer(mut cx: FunctionContext) -> JsResult<JsTrainer> {
 
     let mut js_trainer = JsTrainer::new::<_, JsTrainer, _>(&mut cx, vec![])?;
     let guard = cx.lock();
-    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(trainer.into()));
+    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(RwLock::new(trainer.into())));
 
     Ok(js_trainer)
 }
@@ -327,7 +332,7 @@ fn wordlevel_trainer(mut cx: FunctionContext) -> JsResult<JsTrainer> {
 
     let mut js_trainer = JsTrainer::new::<_, JsTrainer, _>(&mut cx, vec![])?;
     let guard = cx.lock();
-    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(trainer.into()));
+    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(RwLock::new(trainer.into())));
 
     Ok(js_trainer)
 }
@@ -424,7 +429,7 @@ fn unigram_trainer(mut cx: FunctionContext) -> JsResult<JsTrainer> {
 
     let mut js_trainer = JsTrainer::new::<_, JsTrainer, _>(&mut cx, vec![])?;
     let guard = cx.lock();
-    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(trainer.into()));
+    js_trainer.borrow_mut(&guard).trainer = Some(Arc::new(RwLock::new(trainer.into())));
 
     Ok(js_trainer)
 }

diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
@@ -18,6 +18,8 @@ pyo3 = "0.12"
 numpy = { git = "https://github.com/pyo3/rust-numpy/", rev = "e331befa27fede78d4662edf08fa0508db39be01" }
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
+crossbeam = "0.8"
+itertools = "0.9"
 
 [dependencies.tokenizers]
 version = "*"

diff --git a/bindings/python/examples/train_with_datasets.py b/bindings/python/examples/train_with_datasets.py
@@ -0,0 +1,20 @@
+import datasets
+from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers
+
+# Build a BPE tokenizer with a Whitespace pre-tokenizer and a Lowercase normalizer
+bpe_tokenizer = Tokenizer(models.BPE())
+bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+bpe_tokenizer.normalizer = normalizers.Lowercase()
+
+# Load the "wikitext-103-raw-v1" configuration of the "wikitext" dataset
+dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")
+
+# Generator yielding the "text" field of the train split in slices of `batch_length` entries
+def batch_iterator():
+    batch_length = 1000
+    for i in range(0, len(dataset["train"]), batch_length):
+        yield dataset["train"][i : i + batch_length]["text"]
+
+
+# Finally, train from the generator, passing the number of train entries as `length`
+bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset["train"]))