Tokenizer fast save (#11234)

* Save fast tokenizers in both formats

* Fix for HerBERT

* Proper fix

* Properly test new behavior
Author: Sylvain Gugger, 2021-04-15 09:32:32 -04:00 (committed by GitHub)
Commit: 2550b41aa2 (parent: 6e1ee47b36)
4 changed files with 83 additions and 29 deletions


@@ -58,7 +58,7 @@ class HerbertTokenizer(XLMTokenizer):
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def __init__(self, **kwargs):
+    def __init__(self, *args, **kwargs):

         kwargs["cls_token"] = "<s>"
         kwargs["unk_token"] = "<unk>"
@@ -68,7 +68,7 @@ class HerbertTokenizer(XLMTokenizer):
         kwargs["do_lowercase_and_remove_accent"] = False
         kwargs["additional_special_tokens"] = []

-        super().__init__(**kwargs)
+        super().__init__(*args, **kwargs)

         self.bert_pre_tokenizer = BasicTokenizer(
             do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False
         )
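For context (not part of the diff): with only **kwargs in the signature, any positional arguments passed to the HerBERT tokenizer raise a TypeError instead of being forwarded to XLMTokenizer; the *args change lets them through. A minimal sketch of that failure mode, using hypothetical classes rather than the transformers API:

class Base:
    def __init__(self, vocab_file=None, merges_file=None, **kwargs):
        self.vocab_file = vocab_file
        self.merges_file = merges_file


class BrokenSub(Base):
    def __init__(self, **kwargs):  # positional arguments cannot reach Base
        super().__init__(**kwargs)


class FixedSub(Base):
    def __init__(self, *args, **kwargs):  # mirrors the HerBERT change
        super().__init__(*args, **kwargs)


print(FixedSub("vocab.json", "merges.txt").vocab_file)  # vocab.json
try:
    BrokenSub("vocab.json", "merges.txt")
except TypeError as err:  # __init__() takes 1 positional argument but 3 were given
    print(err)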


@@ -1818,10 +1818,22 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
             added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))

             for token, index in added_tok_encoder_sorted:
-                assert index == len(tokenizer), (
-                    f"Non-consecutive added token '{token}' found. "
-                    f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
-                )
+                if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index:
+                    # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the
+                    # index is the current length of the tokenizer (not in vocabulary)
+                    raise ValueError(
+                        f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
+                        f"{index}."
+                    )
+                elif not has_tokenizer_file and index != len(tokenizer):
+                    # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
+                    # current length of the tokenizer.
+                    raise ValueError(
+                        f"Non-consecutive added token '{token}' found. "
+                        f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
+                    )
+                # Safe to call on a tokenizer fast even if token already there.
                 tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))

         # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
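For context (not part of the diff): once a fast tokenizer is saved in both formats, the tokens listed in added_tokens.json may already be present in tokenizer.json on reload, so the old consecutive-index assert no longer holds. A toy illustration with made-up values:

# tokenizer.json already contains the added token, so the vocabulary has grown past its saved index
vocab = {"hello": 0, "world": 1, "[NEW]": 2}
added_tok_encoder = {"[NEW]": 2}  # contents of added_tokens.json
has_tokenizer_file = True

for token, index in added_tok_encoder.items():
    # old rule: index must equal len(vocab) -> 2 != 3 would have raised
    # new rule for fast tokenizers: also accept a token already mapped to that index
    ok = index == len(vocab) or (has_tokenizer_file and vocab.get(token) == index)
    print(token, ok)  # [NEW] True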
@@ -1836,7 +1848,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
-        legacy_format: bool = True,
+        legacy_format: Optional[bool] = None,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
         """
@@ -1844,13 +1856,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         This method make sure the full tokenizer can then be re-loaded using the
-        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
-
-        .. Note::
-            A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
-            not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
-            instance. It can only be loaded in a "fast" tokenizer, i.e. in a
-            :class:`transformers.PreTrainedTokenizerFast` instance.
+        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method..

         .. Warning::
             This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
@@ -1858,11 +1864,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         Args:
             save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved.
-            legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
-                separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
-                possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
-                "slow" tokenizers (not powered by the `tokenizers` library).
+            legacy_format (:obj:`bool`, `optional`):
+                Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
+                format as well as in legacy format, i.e. with tokenizer specific vocabulary and a separate added_tokens
+                files.
+
+                If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible
+                with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to
+                be loaded in the corresponding "slow" tokenizer.
+
+                If :obj:`True`, will save the tokenizer in legacy format.
             filename_prefix: (:obj:`str`, `optional`):
                 A prefix to add to the names of the files saved by the tokenizer.
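Not part of the diff, just a usage sketch of the new default (output directories are illustrative; any fast tokenizer works):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-cased")  # loads a fast tokenizer

tok.save_pretrained("saved-both")                        # default (None): tokenizer.json + legacy vocab/added_tokens files
tok.save_pretrained("saved-fast", legacy_format=False)   # unified tokenizer.json only
tok.save_pretrained("saved-slow", legacy_format=True)    # legacy files only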
@@ -1925,7 +1936,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         self,
         save_directory: Union[str, os.PathLike],
         file_names: Tuple[str],
-        legacy_format: bool = True,
+        legacy_format: Optional[bool] = None,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
         """
@@ -1934,7 +1945,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
         specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
         """
-        if not legacy_format:
+        if legacy_format is False:
             raise ValueError(
                 "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
             )
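Not part of the diff: the guard switches from `not legacy_format` to `legacy_format is False` because `not None` is truthy, so the old check would have rejected the new default for slow tokenizers; only an explicit request for the non-legacy format should be an error there. A quick illustration:

for value in (None, True, False):
    print(value, "rejected" if value is False else "accepted")  # only an explicit False is rejected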


@@ -516,18 +516,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         self,
         save_directory: Union[str, os.PathLike],
         file_names: Tuple[str],
-        legacy_format: bool = True,
+        legacy_format: Optional[bool] = None,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
         """
-        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
-
-        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
-        specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained`
+        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
+        file containing {config + vocab + added-tokens}.
         """
         save_directory = str(save_directory)

-        if legacy_format:
+        save_slow = legacy_format is None or legacy_format is True
+        save_fast = legacy_format is None or legacy_format is False
+
+        if save_slow:
             added_tokens_file = os.path.join(
                 save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
             )
@@ -539,7 +540,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

             file_names = file_names + vocab_files + (added_tokens_file,)
-        else:
+
+        if save_fast:
             tokenizer_file = os.path.join(
                 save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
             )
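Not part of the diff: the behavior of the two derived flags, enumerated exhaustively in a small sketch mirroring the logic above:

for legacy_format in (None, True, False):
    save_slow = legacy_format is None or legacy_format is True
    save_fast = legacy_format is None or legacy_format is False
    print(f"legacy_format={legacy_format!r}: save_slow={save_slow}, save_fast={save_fast}")
# None -> both formats, True -> legacy files only, False -> tokenizer.json only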


@@ -2729,7 +2729,10 @@ class TokenizerTesterMixin:
                 tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
                 tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)

-                # Checks it save with the same files
+                # Checks it save with the same files + the tokenizer.json file for the fast one
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
                 self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

                 # Checks everything loads correctly in the same way
@@ -2744,6 +2747,44 @@ class TokenizerTesterMixin:
                 shutil.rmtree(tmpdirname2)

+                # Save tokenizer rust, legacy_format=True
+                tmpdirname2 = tempfile.mkdtemp()
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it save with the same files
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=False
+                tmpdirname2 = tempfile.mkdtemp()
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saved the tokenizer.json file
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):