mirror of https://github.com/huggingface/transformers.git, synced 2025-08-01 02:31:11 +06:00
[Tokenizer Serialization] Fix the broken serialisation (#27099)

* nits
* nits
* actual fix
* style
* ze fix
* fix fix fix style
This commit is contained in:
parent f4db565b69
commit 230ac352d8
@@ -145,6 +145,8 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
         from_slow = kwargs.pop("from_slow", None)
         from_slow = from_slow or str(pad_token) != "<pad>" or str(eos_token) != "</s>" or str(unk_token) != "<unk>"
 
+        kwargs.pop("added_tokens_decoder", {})
+
         super().__init__(
             vocab_file,
             tokenizer_file=tokenizer_file,
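Context for this hunk: when a Pegasus checkpoint is loaded with non-default special tokens, `from_slow` flips to True and the fast tokenizer is rebuilt from the slow sentencepiece model, so the serialized `added_tokens_decoder` kwarg is popped rather than reapplied. A minimal sketch of that behaviour, assuming `sentencepiece` is installed for the rebuild and using "google/pegasus-xsum" purely as an illustrative checkpoint:

# Sketch only: any Pegasus checkpoint works, "google/pegasus-xsum" is an example.
from transformers import PegasusTokenizerFast

# Default special tokens: the shipped tokenizer.json is reused as-is.
tok = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")

# A non-default pad token makes `from_slow` evaluate to True in __init__,
# so the fast vocab is regenerated from the sentencepiece model and the
# stale `added_tokens_decoder` entry from tokenizer_config.json is dropped.
tok_custom = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum", pad_token="<my_pad>")
assert tok_custom.pad_token == "<my_pad>"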
@@ -2235,7 +2235,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
 
         # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
         # if `tokenizer_config.json` is `None`
-        if "Fast" not in cls.__name__ and tokenizer_file is not None:
+        if tokenizer_file is not None:
             # This is for slow so can be done before
             with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
                 tokenizer_file_handle = json.load(tokenizer_file_handle)
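The relaxed condition means the added-tokens recovery from `tokenizer.json` no longer depends on the class name. Roughly, the branch reads the file's `added_tokens` list, shown here as a standalone sketch (the helper name and path handling are hypothetical; the real logic lives inline in `from_pretrained`):

import json

def load_added_tokens(tokenizer_json_path):
    # tokenizer.json stores added tokens as a list of dicts carrying an
    # index ("id"), the token string ("content"), and the AddedToken flags
    # (lstrip, rstrip, normalized, special, ...).
    with open(tokenizer_json_path, encoding="utf-8") as f:
        data = json.load(f)
    return {entry["id"]: entry["content"] for entry in data.get("added_tokens", [])}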
@@ -2247,14 +2247,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # end legacy
 
         # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
+        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
+        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
         for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
             if added_tokens_map != {} and init_kwargs[key] is not None:
                 if key != "additional_special_tokens":
-                    init_kwargs[key] = added_tokens_map.get(init_kwargs[key], init_kwargs[key])
+                    init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
 
-        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
-        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
-        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
         # Instantiate the tokenizer.
         try:
             tokenizer = cls(*init_inputs, **init_kwargs)
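The one-character `str(...)` change matters because `convert_added_tokens` now runs before the loop, so `init_kwargs[key]` holds an `AddedToken` object while `added_tokens_map` is keyed by plain token content. A small illustration with made-up values:

from tokenizers import AddedToken

# Hypothetical map, keyed by token content as in from_pretrained.
added_tokens_map = {"<pad>": AddedToken("<pad>", lstrip=False, rstrip=False)}
value = AddedToken("<pad>")  # what init_kwargs["pad_token"] now contains

# The old lookup used the AddedToken object itself as the key and missed;
# stringifying first matches the map's string keys.
resolved = added_tokens_map.get(str(value), value)
assert str(resolved) == "<pad>"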