Mirror of https://github.com/huggingface/transformers.git, synced 2025-08-01 02:31:11 +06:00
update clean_up_tokenization_spaces warning (#32371)
This commit is contained in:
parent 05c1f9af9a
commit 2229ebe722
@@ -1593,6 +1593,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
 
+        if "clean_up_tokenization_spaces" not in kwargs:
+            warnings.warn(
+                "`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This "
+                "behavior will be deprecated in transformers v4.45, and will then be set to `False` by default. "
+                "For more details check this issue: https://github.com/huggingface/transformers/issues/31884",
+                FutureWarning,
+            )
+
         # By default, cleaning tokenization spaces for both fast and slow tokenizers
         self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
 
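For downstream code, the straightforward way to keep this new warning from firing is to set the flag explicitly when loading the tokenizer. A minimal sketch (the checkpoint name is illustrative): passing the kwarg explicitly means it is present in the init kwargs, so the `not in kwargs` branch above is skipped and the behavior stays pinned regardless of the v4.45 default flip.

    from transformers import AutoTokenizer

    # Explicitly pinning clean_up_tokenization_spaces suppresses the
    # FutureWarning and keeps decode behavior stable across versions.
    tokenizer = AutoTokenizer.from_pretrained(
        "google-bert/bert-base-uncased",  # any checkpoint; illustrative
        clean_up_tokenization_spaces=False,
    )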
@@ -4247,52 +4247,6 @@ class TokenizerTesterMixin:
             # Should not raise an error
             self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
 
-    # TODO This is run for all models but only tests bert...
-    def test_clean_up_tokenization_spaces(self):
-        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
-        assert tokenizer.clean_up_tokenization_spaces is True
-
-        tokens = tokenizer.encode("This shouldn't be! He'll go.")
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        tokenizer.clean_up_tokenization_spaces = False
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-        assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
-
-        # Fast from slow
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer.save_pretrained(tmp_dir_2)
-            tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
-            del tokenizer
-
-        assert tokenizer_fast.clean_up_tokenization_spaces is False
-        decoded = tokenizer_fast.decode(tokens)
-        # fast and slow don't have the same output when we don't cleanup
-        # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
-        assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]"
-
-        tokenizer_fast.clean_up_tokenization_spaces = True
-        assert tokenizer_fast.clean_up_tokenization_spaces is True
-
-        decoded = tokenizer_fast.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        # Slow from fast
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer_fast.clean_up_tokenization_spaces = False
-            tokenizer_fast.save_pretrained(tmp_dir_2)
-            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
-
-        assert tokenizer.clean_up_tokenization_spaces is False
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-
-        tokenizer.clean_up_tokenization_spaces = True
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
     def test_split_special_tokens(self):
         if not self.test_slow_tokenizer:
             self.skipTest(reason="test_slow_tokenizer is set to False")
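For reference, the round-trip behavior the deleted test pinned down is easy to reproduce by hand; a short sketch using the same google-bert/bert-base-uncased checkpoint and input string, with expected outputs taken from the removed assertions:

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    tokens = tokenizer.encode("This shouldn't be! He'll go.")

    # With cleanup (the current default), spaces around punctuation are merged back:
    print(tokenizer.decode(tokens, clean_up_tokenization_spaces=True))
    # [CLS] this shouldn't be! he'll go. [SEP]

    # Without cleanup, the raw detokenized spacing is kept:
    print(tokenizer.decode(tokens, clean_up_tokenization_spaces=False))
    # [CLS] this shouldn ' t be ! he ' ll go . [SEP]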