[T5Tokenizer] Fix fast and extra tokens (#27085)

* v4.35.dev.0 * nit t5fast match t5 slow
2025-07-31 02:02:21 +06:00 · 2023-10-27 08:18:24 +02:00 · 2023-10-27 08:18:24 +02:00 · aa4198a238
commit aa4198a238
parent 6f31601687
1 changed file with 8 additions and 6 deletions
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -118,17 +118,19 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
        **kwargs,
    ):
        # Add extra_ids to the special token list
-        if extra_ids > 0 and additional_special_tokens is None:
-            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
-        elif extra_ids > 0 and additional_special_tokens is not None:
-            # Check that we have the right number of extra special tokens
-            extra_tokens = len(set(filter(lambda x: bool("extra_id_" in str(x)), additional_special_tokens)))
-            if extra_tokens != extra_ids:
+        if additional_special_tokens is not None:
+            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
+            if len(extra_tokens) < 1:
+                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
+            elif extra_ids > 0 and extra_ids != len(extra_tokens):
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
                    " tokens"
                )
+        else:
+            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
+            additional_special_tokens = extra_tokens

        super().__init__(
            vocab_file,