Mirror of https://github.com/huggingface/transformers.git — synced 2025-08-01 02:31:11 +06:00
Tokenizer fast save (#11234)
* Save fast tokenizers in both formats
* Fix for HerBERT
* Proper fix
* Properly test new behavior
commit 2550b41aa2
parent 6e1ee47b36

@@ -58,7 +58,7 @@ class HerbertTokenizer(XLMTokenizer):
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, **kwargs):
+    def __init__(self, *args, **kwargs):
 
         kwargs["cls_token"] = "<s>"
         kwargs["unk_token"] = "<unk>"
@@ -68,7 +68,7 @@ class HerbertTokenizer(XLMTokenizer):
         kwargs["do_lowercase_and_remove_accent"] = False
         kwargs["additional_special_tokens"] = []
 
-        super().__init__(**kwargs)
+        super().__init__(*args, **kwargs)
         self.bert_pre_tokenizer = BasicTokenizer(
             do_lower_case=False, never_split=self.all_special_tokens, tokenize_chinese_chars=False, strip_accents=False
         )
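
The two HerBERT hunks above only widen the constructor signature so that positional arguments (typically the vocabulary and merges file paths expected by XLMTokenizer) are forwarded to the parent class instead of being dropped. A minimal sketch of the pattern, using hypothetical classes rather than the real tokenizers:

# Sketch of forwarding positional arguments through a subclass __init__,
# mirroring the HerbertTokenizer change above (classes here are hypothetical).
class ParentTokenizer:
    def __init__(self, vocab_file=None, merges_file=None, **kwargs):
        self.vocab_file = vocab_file
        self.merges_file = merges_file
        self.extra = kwargs


class ChildTokenizer(ParentTokenizer):
    def __init__(self, *args, **kwargs):
        kwargs["cls_token"] = "<s>"  # fixed special token, as HerBERT does
        # Without *args, positional file paths passed at load time would be lost.
        super().__init__(*args, **kwargs)


tok = ChildTokenizer("vocab.json", "merges.txt")
assert tok.vocab_file == "vocab.json" and tok.merges_file == "merges.txt"
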
@@ -1818,10 +1818,22 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))
 
         for token, index in added_tok_encoder_sorted:
-            assert index == len(tokenizer), (
-                f"Non-consecutive added token '{token}' found. "
-                f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
-            )
+            if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index:
+                # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the
+                # index is the current length of the tokenizer (not in vocabulary)
+                raise ValueError(
+                    f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found "
+                    f"{index}."
+                )
+            elif not has_tokenizer_file and index != len(tokenizer):
+                # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the
+                # current length of the tokenizer.
+                raise ValueError(
+                    f"Non-consecutive added token '{token}' found. "
+                    f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
+                )
+
+            # Safe to call on a tokenizer fast even if token already there.
             tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))
 
         # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
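
A rough standalone sketch of the consistency check introduced above (plain functions stand in for the tokenizer object; this is not the library code): when a fast tokenizer file is present, an added token must either already sit at its saved index or be appended at the current vocabulary size, while a slow tokenizer still requires strictly consecutive indices.

# Simplified illustration of the added-token index rules from the hunk above.
# `vocab_size` plays the role of len(tokenizer) and `token_to_id` the role of
# tokenizer.convert_tokens_to_ids; both are stand-ins, not real API.
def check_added_token(token, index, vocab_size, token_to_id, has_tokenizer_file):
    if has_tokenizer_file and index != vocab_size and token_to_id(token) != index:
        # Fast tokenizer: token must already be in the vocabulary at the saved
        # index, or be a genuinely new token appended at the end.
        raise ValueError(f"Wrong index found for {token}: should be {token_to_id(token)} but found {index}.")
    elif not has_tokenizer_file and index != vocab_size:
        # Slow tokenizer: added tokens must be consecutive from the current size.
        raise ValueError(
            f"Non-consecutive added token '{token}' found. "
            f"Should have index {vocab_size} but has index {index} in saved vocabulary."
        )


# A token already registered at index 5 passes the fast-tokenizer check:
check_added_token("<special>", 5, 100, lambda t: 5, has_tokenizer_file=True)
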
@@ -1836,7 +1848,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
-        legacy_format: bool = True,
+        legacy_format: Optional[bool] = None,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
         """
@@ -1844,13 +1856,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
 
 
         This method make sure the full tokenizer can then be re-loaded using the
-        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
+        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method..
 
-        .. Note::
-            A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
-            not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
-            instance. It can only be loaded in a "fast" tokenizer, i.e. in a
-            :class:`transformers.PreTrainedTokenizerFast` instance.
-
         .. Warning::
            This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
@@ -1858,11 +1864,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
 
         Args:
             save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved.
-            legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
-                separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
-                possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
-                "slow" tokenizers (not powered by the `tokenizers` library).
+            legacy_format (:obj:`bool`, `optional`):
+                Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
+                format as well as in legacy format, i.e. with tokenizer specific vocabulary and a separate added_tokens
+                files.
+
+                If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible
+                with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to
+                be loaded in the corresponding "slow" tokenizer.
+
+                If :obj:`True`, will save the tokenizer in legacy format.
             filename_prefix: (:obj:`str`, `optional`):
                 A prefix to add to the names of the files saved by the tokenizer.
 
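
A hedged usage sketch of the behaviour documented above (the model name is only an example, and the exact legacy file names such as vocab.txt or added_tokens.json depend on the tokenizer class): the new default saves a fast tokenizer in both formats, legacy_format=False keeps only the unified tokenizer.json, and legacy_format=True keeps only the legacy files.

# Usage sketch, assuming a fast tokenizer; directory names are arbitrary.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-cased")   # loads the fast tokenizer by default

tok.save_pretrained("saved-both")                        # tokenizer.json + legacy vocabulary files
tok.save_pretrained("saved-fast", legacy_format=False)   # unified tokenizer.json only
tok.save_pretrained("saved-legacy", legacy_format=True)  # legacy vocabulary + added_tokens files only
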
@@ -1925,7 +1936,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         self,
         save_directory: Union[str, os.PathLike],
         file_names: Tuple[str],
-        legacy_format: bool = True,
+        legacy_format: Optional[bool] = None,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
         """
@@ -1934,7 +1945,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
         specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
         """
-        if not legacy_format:
+        if legacy_format is False:
             raise ValueError(
                 "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
             )
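
The base-class guard changes from a truthiness test to an identity test because the default is now None: `not None` is truthy, so the old `if not legacy_format:` would have raised for every slow tokenizer saved with the new default, whereas `legacy_format is False` only rejects an explicit request for the non-legacy format. A minimal sketch of the difference:

# Why the identity check matters once the default becomes None.
for legacy_format in (None, True, False):
    old_guard = not legacy_format        # triggers for None *and* False
    new_guard = legacy_format is False   # triggers only for an explicit False
    print(f"legacy_format={legacy_format!r}: old={old_guard}, new={new_guard}")
# legacy_format=None:  old=True,  new=False  -> default no longer raises for slow tokenizers
# legacy_format=True:  old=False, new=False
# legacy_format=False: old=True,  new=True   -> explicit non-legacy request still raises
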
@@ -516,18 +516,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         self,
         save_directory: Union[str, os.PathLike],
         file_names: Tuple[str],
-        legacy_format: bool = True,
+        legacy_format: Optional[bool] = None,
         filename_prefix: Optional[str] = None,
     ) -> Tuple[str]:
         """
-        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
-
-        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
-        specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained`
+        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
+        file containing {config + vocab + added-tokens}.
         """
         save_directory = str(save_directory)
 
-        if legacy_format:
+        save_slow = legacy_format is None or legacy_format is True
+        save_fast = legacy_format is None or legacy_format is False
+
+        if save_slow:
             added_tokens_file = os.path.join(
                 save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
             )
@@ -539,7 +540,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
 
             vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
             file_names = file_names + vocab_files + (added_tokens_file,)
-        else:
+
+        if save_fast:
             tokenizer_file = os.path.join(
                 save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
             )
@@ -2729,7 +2729,10 @@ class TokenizerTesterMixin:
 
                 tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
                 tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-                # Checks it save with the same files
+
+                # Checks it save with the same files + the tokenizer.json file for the fast one
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
                 self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
 
                 # Checks everything loads correctly in the same way
@@ -2744,6 +2747,44 @@ class TokenizerTesterMixin:
 
                 shutil.rmtree(tmpdirname2)
 
+                # Save tokenizer rust, legacy_format=True
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it save with the same files
+                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
+                # Save tokenizer rust, legacy_format=False
+                tmpdirname2 = tempfile.mkdtemp()
+
+                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
+                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
+
+                # Checks it saved the tokenizer.json file
+                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
+
+                # Checks everything loads correctly in the same way
+                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
+                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
+
+                # Check special tokens are set accordingly on Rust and Python
+                for key in tokenizer_pp.special_tokens_map:
+                    self.assertTrue(hasattr(tokenizer_rp, key))
+
+                shutil.rmtree(tmpdirname2)
+
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):