Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-31 02:02:21 +06:00.
[T5Tokenizer] Fix fast and extra tokens (#27085)

* v4.35.dev.0
* nit: make T5TokenizerFast match the slow T5Tokenizer
This commit is contained in: commit aa4198a238 (parent 6f31601687).
@ -118,17 +118,19 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
|
||||
**kwargs,
|
||||
):
|
||||
# Add extra_ids to the special token list
|
||||
if extra_ids > 0 and additional_special_tokens is None:
|
||||
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
|
||||
elif extra_ids > 0 and additional_special_tokens is not None:
|
||||
# Check that we have the right number of extra special tokens
|
||||
extra_tokens = len(set(filter(lambda x: bool("extra_id_" in str(x)), additional_special_tokens)))
|
||||
if extra_tokens != extra_ids:
|
||||
if additional_special_tokens is not None:
|
||||
extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
|
||||
if len(extra_tokens) < 1:
|
||||
additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
|
||||
elif extra_ids > 0 and extra_ids != len(extra_tokens):
|
||||
raise ValueError(
|
||||
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
|
||||
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
|
||||
" tokens"
|
||||
)
|
||||
else:
|
||||
extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
|
||||
additional_special_tokens = extra_tokens
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
|
Loading…
Reference in New Issue
Block a user