diff --git a/bindings/node/Cargo.toml b/bindings/node/Cargo.toml
index f015a03e0..fc180139c 100644
--- a/bindings/node/Cargo.toml
+++ b/bindings/node/Cargo.toml
@@ -2,7 +2,7 @@
 authors = ["Nicolas Patry "]
 edition = "2021"
 name = "node"
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
diff --git a/bindings/node/index.d.ts b/bindings/node/index.d.ts
index c18b7345e..5e9831978 100644
--- a/bindings/node/index.d.ts
+++ b/bindings/node/index.d.ts
@@ -175,7 +175,7 @@ export class Encoding {
   getSequenceIds(): Array
   tokenToSequence(token: number): number | null
 }
-export class Model { }
+export class Model {}
 export type Bpe = BPE
 export class BPE {
   static empty(): Model
@@ -204,7 +204,7 @@ export class Normalizer {
 export class PreTokenizer {
   preTokenizeString(sequence: string): [string, [number, number]][]
 }
-export class Processor { }
+export class Processor {}
 export class AddedToken {
   constructor(token: string, isSpecial: boolean, options?: AddedTokenOptions | undefined | null)
   getContent(): string
@@ -229,7 +229,6 @@ export class Tokenizer {
   decodeBatch(ids: Array<Array<number>>, skipSpecialTokens: boolean): Promise
   static fromString(s: string): Tokenizer
   static fromFile(file: string): Tokenizer
-  // static fromPretrained(file: string, parameters?: JsFromPretrainedParameters | undefined | null): Tokenizer
   addSpecialTokens(tokens: Array): void
   setTruncation(maxLength: number, options?: TruncationOptions | undefined | null): void
   disableTruncation(): void
@@ -251,4 +250,4 @@ export class Tokenizer {
     addSpecialTokens?: boolean | undefined | null,
   ): Encoding
 }
-export class Trainer { }
+export class Trainer {}
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index b76c68bb3..cf9cd5a0f 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 authors = ["Anthony MOI "]
 edition = "2021"
 
@@ -21,7 +21,7 @@
 onig = { version = "6.4", default-features = false }
 itertools = "0.11"
 [dependencies.tokenizers]
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 path = "../../tokenizers"
 
 [dev-dependencies]
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 729cd9062..8343267d5 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI ", "Nicolas Patry "]
 edition = "2018"
 name = "tokenizers"
-version = "0.14.1-dev.0"
+version = "0.14.2-dev.0"
 homepage = "https://github.com/huggingface/tokenizers"
 repository = "https://github.com/huggingface/tokenizers"
 documentation = "https://docs.rs/tokenizers/"
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index 0a7fa4e56..3821cdab4 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -21,17 +21,17 @@ impl PartialEq for Merge {
 }
 impl PartialOrd for Merge {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        if self.count != other.count {
-            Some(self.count.cmp(&other.count))
-        } else {
-            // Here we want ascending order
-            Some(other.pair.cmp(&self.pair))
-        }
+        Some(self.cmp(other))
     }
 }
 
 impl Ord for Merge {
     fn cmp(&self, other: &Self) -> Ordering {
-        self.partial_cmp(other).unwrap()
+        if self.count != other.count {
+            self.count.cmp(&other.count)
+        } else {
+            // Here we want ascending order
+            other.pair.cmp(&self.pair)
+        }
     }
 }
@@ -533,15 +533,16 @@ impl BpeTrainer {
             let changes = top
                 .pos
                 .maybe_par_iter()
-                .flat_map(|i| {
-                    let w = &words[*i] as *const _ as *mut _;
+                .flat_map(|&i| {
+                    let word = &words[i] as *const _ as *mut Word;
                     // We can merge each of these words in parallel here because each position
                     // can be there only once (HashSet). So this is safe.
                     unsafe {
-                        let word: &mut Word = &mut (*w);
-                        word.merge(top.pair.0, top.pair.1, new_token_id, max_token_length)
+                        // let word: &mut Word = &mut (*word);
+                        (*word)
+                            .merge(top.pair.0, top.pair.1, new_token_id, max_token_length)
                             .into_iter()
-                            .map(|c| (c, *i))
+                            .map(|c| (c, i))
                             .collect::<Vec<_>>()
                     }
                 })
diff --git a/tokenizers/src/models/bpe/word.rs b/tokenizers/src/models/bpe/word.rs
index 37005eca6..6fc8033e3 100644
--- a/tokenizers/src/models/bpe/word.rs
+++ b/tokenizers/src/models/bpe/word.rs
@@ -20,17 +20,17 @@ impl PartialOrd for Merge {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         // By manually implementing this, we make the containing BinaryHeap a
         // min-heap ordered first on the rank, and the pos otherwise
-        if self.rank != other.rank {
-            Some(other.rank.cmp(&self.rank))
-        } else {
-            Some(other.pos.cmp(&self.pos))
-        }
+        Some(self.cmp(other))
     }
 }
 
 impl Ord for Merge {
     fn cmp(&self, other: &Self) -> Ordering {
-        self.partial_cmp(other).unwrap()
+        if self.rank != other.rank {
+            other.rank.cmp(&self.rank)
+        } else {
+            other.pos.cmp(&self.pos)
+        }
     }
 }
diff --git a/tokenizers/src/models/unigram/trie.rs b/tokenizers/src/models/unigram/trie.rs
index 74f9c60bf..2f94b1766 100644
--- a/tokenizers/src/models/unigram/trie.rs
+++ b/tokenizers/src/models/unigram/trie.rs
@@ -25,7 +25,7 @@ impl Trie
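Note on the ordering hunks in trainer.rs and word.rs: they move the real comparison logic into Ord and make partial_cmp delegate via Some(self.cmp(other)), the canonical shape Clippy lints toward. This guarantees the two impls can never disagree, and cmp no longer round-trips through partial_cmp(...).unwrap(). A minimal, self-contained sketch of the pattern, using a stand-in struct whose count/pair fields mirror the trainer.rs Merge (the derives and main are illustrative, not the crate's code):

use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Hypothetical stand-in for the crate's `Merge`; fields mirror the diff.
#[derive(Debug, PartialEq, Eq)]
struct Merge {
    count: u64,
    pair: (u32, u32),
}

impl Ord for Merge {
    // All comparison logic lives in the total order.
    fn cmp(&self, other: &Self) -> Ordering {
        if self.count != other.count {
            // Primary key: ascending on count.
            self.count.cmp(&other.count)
        } else {
            // Tie-break with reversed operands, so lexicographically
            // smaller pairs compare as greater (ascending pop order).
            other.pair.cmp(&self.pair)
        }
    }
}

impl PartialOrd for Merge {
    // Delegate instead of re-implementing: the impls cannot disagree,
    // and there is no partial_cmp -> cmp -> partial_cmp cycle.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let a = Merge { count: 3, pair: (1, 2) };
    let b = Merge { count: 5, pair: (0, 9) };
    assert!(a < b); // higher count sorts last in the ascending order

    // BinaryHeap is a max-heap, so the most frequent pair pops first.
    let mut heap = BinaryHeap::new();
    heap.push(a);
    heap.push(b);
    assert_eq!(heap.pop().map(|m| m.count), Some(5));
}

Behavior is unchanged by the refactor: only the direction of delegation between cmp and partial_cmp is flipped.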