Skip to content

Commit

Permalink
Add quick doc to byte_level.rs (#1420)
Browse files Browse the repository at this point in the history
* Add quick doc to byte_level.rs

* Address PR comments
  • Loading branch information
steventrouble authored Jan 3, 2024
1 parent 1146259 commit f1c23b8
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions tokenizers/src/pre_tokenizers/byte_level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ use crate::tokenizer::{
};
use crate::utils::macro_rules_attribute;

/// Returns a map from bytes to the unicode characters that represent them.
/// See <https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9>
fn bytes_char() -> HashMap<u8, char> {
let mut bs: Vec<u8> = vec![];
bs.extend(b'!'..=b'~');
Expand All @@ -33,6 +35,8 @@ fn bytes_char() -> HashMap<u8, char> {
}

lazy_static! {
/// Regex that matches exactly one token.
/// See <https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98>
static ref RE: SysRegex = SysRegex::new(
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
)
Expand Down

0 comments on commit f1c23b8

Please sign in to comment.