🚨🚨🚨 Limit backtracking in Nougat regexp (#35264)

* Limit backtracking in regexp

* Update

* [run-slow] nougat

* Update
This commit is contained in:
Pavel Iakubovskii 2024-12-17 16:34:18 +00:00 committed by GitHub
parent d29a06e39a
commit deac971c46
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -514,7 +514,7 @@ class NougatTokenizerFast(PreTrainedTokenizerFast):
generation = generation.replace("\n* [leftmargin=*]\n", "\n")
# Remove lines with markdown headings starting with #, with numerals,
# and possibly roman numerals with trailing spaces and newlines
-generation = re.sub(r"^#+ (?:\.?(?:\d|[ixv])+)*\s*(?:$|\n\s*)", "", generation, flags=re.M)
+generation = re.sub(r"^#+ (?:[\d+\.]+|[ixv\.]+)?\s*(?:$|\n\s*)", "", generation, flags=re.M)
# most likely hallucinated titles
lines = generation.split("\n")
if lines[-1].startswith("#") and lines[-1].lstrip("#").startswith(" ") and len(lines) > 1: