clean_up_tokenization_spaces=False if unset (#31938)

* clean_up_tokenization_spaces=False if unset

* deprecate warning

* updating param for old models

* update models

* make fix-copies

* fix-copies and update bert models

* warning msg

* update prophet and clvp

* updating test since space before is arbitrarily removed

* remove warning for 4.45
This commit is contained in:
Ita Zaporozhets 2024-09-26 13:38:20 -04:00 committed by GitHub
parent 3557f9a14a
commit 6730485b02
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 67 additions and 7 deletions

View File

@ -88,6 +88,9 @@ class BertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -105,6 +108,7 @@ class BertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -136,6 +140,7 @@ class BertTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -91,6 +91,9 @@ class ConvBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original ConvBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -108,6 +111,7 @@ class ConvBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -139,6 +143,7 @@ class ConvBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -90,6 +90,9 @@ class DistilBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -108,6 +111,7 @@ class DistilBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -138,6 +142,7 @@ class DistilBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -90,6 +90,9 @@ class ElectraTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original Electra).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -107,6 +110,7 @@ class ElectraTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -138,6 +142,7 @@ class ElectraTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -107,6 +107,9 @@ class FunnelTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -127,6 +130,7 @@ class FunnelTokenizer(PreTrainedTokenizer):
eos_token="</s>",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -159,6 +163,7 @@ class FunnelTokenizer(PreTrainedTokenizer):
eos_token=eos_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -91,6 +91,9 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original LayoutLM).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -108,6 +111,7 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -139,6 +143,7 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -90,6 +90,9 @@ class LxmertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original Lxmert).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -107,6 +110,7 @@ class LxmertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -138,6 +142,7 @@ class LxmertTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -92,6 +92,9 @@ class MobileBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original MobileBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -109,6 +112,7 @@ class MobileBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -140,6 +144,7 @@ class MobileBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -108,6 +108,9 @@ class MPNetTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -128,6 +131,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
mask_token="<mask>",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
@ -170,6 +174,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -308,6 +308,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -330,6 +333,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
mask_token: Optional[str] = "[MASK]",
tokenize_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None,
clean_up_tokenization_spaces: bool = True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -360,6 +364,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -91,6 +91,9 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original SqueezeBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -108,6 +111,7 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not os.path.isfile(vocab_file):
@ -139,6 +143,7 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -225,6 +225,9 @@ class TapasTokenizer(PreTrainedTokenizer):
Minimum length of each question in terms of tokens (will be skipped otherwise).
max_question_length (`int`, *optional*):
Maximum length of each question in terms of tokens (will be skipped otherwise).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
extra spaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -252,6 +255,7 @@ class TapasTokenizer(PreTrainedTokenizer):
max_question_length=None,
model_max_length: int = 512,
additional_special_tokens: Optional[List[str]] = None,
clean_up_tokenization_spaces=True,
**kwargs,
):
if not is_pandas_available():
@ -322,6 +326,7 @@ class TapasTokenizer(PreTrainedTokenizer):
max_question_length=max_question_length,
model_max_length=model_max_length,
additional_special_tokens=additional_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)

View File

@ -1622,7 +1622,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
)
# By default, cleaning tokenization spaces for both fast and slow tokenizers
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
# By default, do not split special tokens for both fast and slow tokenizers
self.split_special_tokens = kwargs.pop("split_special_tokens", False)

View File

@ -79,7 +79,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
output_text = "lower[SPACE]newer"
return input_text, output_text
# Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens

View File

@ -147,8 +147,8 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
self.assertEqual(batch_tokens, ["HELLO<unk>!?!?$$$", "BYE BYE<unk>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"])
self.assertEqual(batch_tokens, ["HELLO<unk>!? !?$$$", "BYE BYE<unk>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!? !?", "BYE BYE"])
def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
@ -467,8 +467,8 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
self.assertEqual(batch_tokens, ["HELLO<unk>!?!?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?<new_tokens>", "BYE BYE<new_tokens>"])
self.assertEqual(batch_tokens, ["HELLO<unk>!? !?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!? !?<new_tokens>", "BYE BYE<new_tokens>"])
def test_special_characters_in_vocab(self):
sent = "ʈʰ æ æ̃ ˧ kʰ"

View File

@ -249,7 +249,7 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
# fmt: on
batch_tokens = tokenizer.batch_decode(sample_ids)
self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ!?!? $$$", "j ð s j ð s oːɹ $$$"])
self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ ! ? ! ? $$$", "j ð s j ð s oːɹ $$$"])
@staticmethod
def get_from_offsets(offsets, key):