Skip to content

Commit

Permalink
Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)"
Browse files Browse the repository at this point in the history

This reverts commit 25aee8b.
  • Loading branch information
ArthurZucker committed Jul 12, 2024
1 parent fdd26ba commit 3eed134
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 51 deletions.
6 changes: 0 additions & 6 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,6 @@ jobs:
toolchain: stable
components: rustfmt, clippy

- name: Install audit
uses: actions-rs/cargo@v1
with:
command: install
args: cargo-audit

- name: Install Python
uses: actions/setup-python@v4
with:
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,6 @@ jobs:
command: install
args: cargo-readme

- name: Install audit
uses: actions-rs/cargo@v1
with:
command: install
args: cargo-audit

- name: Build
uses: actions-rs/cargo@v1
with:
Expand Down
8 changes: 0 additions & 8 deletions tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,21 +216,13 @@ impl AddedVocabulary {
}

/// Get the token matching the given id if it exists
#[deprecated(
since = "0.19.0",
note = "please use `added_vocabulary.simple_id_to_token(id).or_else(|| model.id_to_token(id)` instead"
)]
pub fn id_to_token(&self, id: u32, model: &impl Model) -> Option<String> {
self.added_tokens_map_r
.get(&id)
.map(|t| t.content.clone())
.or_else(|| model.id_to_token(id))
}

pub fn simple_id_to_token(&self, id: u32) -> Option<String> {
self.added_tokens_map_r.get(&id).map(|t| t.content.clone())
}

//
pub fn set_encode_special_tokens(&mut self, value: bool) {
self.encode_special_tokens = value;
Expand Down
47 changes: 16 additions & 31 deletions tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -699,9 +699,7 @@ where

/// Converts an id to the corresponding token.
pub fn id_to_token(&self, id: u32) -> Option<String> {
self.added_vocabulary
.simple_id_to_token(id)
.or_else(|| self.model.id_to_token(id))
self.added_vocabulary.id_to_token(id, &self.model)
}

/// set the added vocab's splitting scheme
Expand Down Expand Up @@ -847,35 +845,22 @@ where

/// Decode the given ids, back to a String
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
let mut result = String::with_capacity(ids.len());
let mut chunks = Vec::with_capacity(ids.len());
for id in ids {
if let Some(added_token) = self.added_vocabulary.simple_id_to_token(*id) {
if skip_special_tokens && self.added_vocabulary.is_special_token(&added_token) {
continue;
}
let text_chunk = if let Some(decoder) = &self.decoder {
decoder.decode(chunks.clone())?
} else {
chunks.join(" ")
};
result.push_str(&text_chunk);
if !result.is_empty() && self.decoder.is_none() {
result.push(' ');
}
result.push_str(&added_token);
chunks.clear();
} else if let Some(token) = self.model.id_to_token(*id) {
chunks.push(token);
}
}
let text_chunk = if let Some(decoder) = &self.decoder {
decoder.decode(chunks.clone())?
let tokens = ids
.iter()
.filter_map(|id| {
self.added_vocabulary
.id_to_token(*id, &self.model)
.filter(|token| {
!skip_special_tokens || !self.added_vocabulary.is_special_token(token)
})
})
.collect::<Vec<_>>();

if let Some(decoder) = &self.decoder {
decoder.decode(tokens)
} else {
chunks.join(" ")
};
result.push_str(&text_chunk);
Ok(result)
Ok(tokens.join(" "))
}
}
}

Expand Down

0 comments on commit 3eed134

Please sign in to comment.