/**
 * Helper function to split a string on a regex, but keep the delimiters.
 *
 * This is required because the JavaScript `.split()` method does not keep the delimiters,
 * and wrapping the pattern in a capturing group causes issues with capturing groups the
 * pattern already contains (due to nesting).
 *
 * The caller's regex is never iterated directly. A private copy is used that always
 * carries the `g` flag (which `matchAll` requires) and always starts at `lastIndex` 0,
 * so a non-global regex, or a shared regex left mid-string by an earlier
 * `exec()`/`test()` call, still splits the entire text. The caller's regex is not mutated.
 *
 * @param {string} text The text to split.
 * @param {RegExp} regex The regex to split on.
 * @returns {string[]} The non-empty pieces of `text`, in order, with each non-empty
 * delimiter match kept as its own element.
 */
function regexSplit(text, regex) {
    // `matchAll` throws a TypeError on a non-global regex, and its internal clone inherits
    // the source regex's `lastIndex`; a fresh copy sidesteps both problems.
    const flags = regex.global ? regex.flags : `${regex.flags}g`;
    const matcher = new RegExp(regex, flags);

    const result = [];
    let prev = 0;
    for (const match of text.matchAll(matcher)) {
        const fullMatch = match[0];
        // Text between the previous delimiter and this one.
        if (prev < match.index) {
            result.push(text.slice(prev, match.index));
        }
        // Zero-length matches are valid split points but contribute no delimiter text.
        if (fullMatch.length > 0) {
            result.push(fullMatch);
        }
        prev = match.index + fullMatch.length;
    }
    // Trailing text after the last delimiter.
    if (prev < text.length) {
        result.push(text.slice(prev));
    }
    return result;
}
// See https://stackoverflow.com/a/63007777/13989043 for more information - const regex = pattern.Regex - .replace(/\\([#&~])/g, '$1') // TODO: add more characters to this list if necessary - .replace(/\\(\d+)/g, '$$$1') // Python uses \1, \2, etc. for group substitutions, but JavaScript uses $1, $2, etc. - - // NOTE: if invert is true, we wrap the pattern in a group so that it is kept when performing .split() - return new RegExp(invert ? regex : `(${regex})`, 'gu'); + const regex = pattern.Regex.replace(/\\([#&~])/g, '$1'); // TODO: add more characters to this list if necessary + return new RegExp(regex, 'gu'); } else if (pattern.String !== undefined) { return pattern.String; @@ -1317,6 +1341,8 @@ class SplitPreTokenizer extends PreTokenizer { if (this.config.invert) { return text.match(this.pattern) || []; + } else if (this.pattern instanceof RegExp) { + return regexSplit(text, this.pattern); } else { return text.split(this.pattern).filter(x => x); }