From deac971c469bcbb182c2e52da0b82fb3bf54cccf Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Tue, 17 Dec 2024 16:34:18 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A8=F0=9F=9A=A8=F0=9F=9A=A8=20Limit=20?= =?UTF-8?q?backtracking=20in=20Nougat=20regexp=20(#35264)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Limit backtracking in regexp * Update * [run-slow] nougat * Update --- src/transformers/models/nougat/tokenization_nougat_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/nougat/tokenization_nougat_fast.py b/src/transformers/models/nougat/tokenization_nougat_fast.py index 0a7eec4ad98a4c..5d0a8934c05ee1 100644 --- a/src/transformers/models/nougat/tokenization_nougat_fast.py +++ b/src/transformers/models/nougat/tokenization_nougat_fast.py @@ -514,7 +514,7 @@ def post_process_single(self, generation: str, fix_markdown: bool = True) -> str generation = generation.replace("\n* [leftmargin=*]\n", "\n") # Remove lines with markdown headings starting with #, with numerals, # and possibly roman numerals with trailing spaces and newlines - generation = re.sub(r"^#+ (?:\.?(?:\d|[ixv])+)*\s*(?:$|\n\s*)", "", generation, flags=re.M) + generation = re.sub(r"^#+ (?:[\d+\.]+|[ixv\.]+)?\s*(?:$|\n\s*)", "", generation, flags=re.M) # most likely hallucinated titles lines = generation.split("\n") if lines[-1].startswith("#") and lines[-1].lstrip("#").startswith(" ") and len(lines) > 1: