From 4a6addab69e57e50daeb145876b36314d7c50da4 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Thu, 25 Jan 2024 14:40:56 +0200 Subject: [PATCH] Return empty tokens array if text is empty after normalization (#535) --- src/tokenizers.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/tokenizers.js b/src/tokenizers.js index 030acfec7..8cee6f2dc 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -2744,6 +2744,12 @@ export class PreTrainedTokenizer extends Callable { x = this.normalizer(x); } + // If, after normalization, this section is empty (e.g., trimming whitespace), + // we return an empty array + if (x.length === 0) { + return []; + } + const sectionTokens = (this.pre_tokenizer !== null) ? this.pre_tokenizer(x, { section_index, }) : [x];