[T5Tokenizer] Fix fast and extra tokens (#27085)

* v4.35.dev.0

* nit: make the fast T5 tokenizer match the slow T5 tokenizer
This commit is contained in:
Arthur 2023-10-27 08:18:24 +02:00 committed by GitHub
parent 6f31601687
commit aa4198a238
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -118,17 +118,19 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
**kwargs,
):
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None:
# Check that we have the right number of extra special tokens
extra_tokens = len(set(filter(lambda x: bool("extra_id_" in str(x)), additional_special_tokens)))
if extra_tokens != extra_ids:
if additional_special_tokens is not None:
extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
if len(extra_tokens) < 1:
additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and extra_ids != len(extra_tokens):
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
else:
extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
additional_special_tokens = extra_tokens
super().__init__(
vocab_file,