Processor accepts any kwargs (#31889)

* accept kwargs in processors * return unused kwargs * fix tests * typo * update the other way
2025-07-30 01:32:23 +06:00 · 2024-07-11 13:20:30 +05:00 · 2024-07-11 13:20:30 +05:00 · 14d3b3f0f0
commit 14d3b3f0f0
parent a695c18649
14 changed files with 65 additions and 22 deletions
--- a/src/transformers/models/blip/processing_blip.py
+++ b/src/transformers/models/blip/processing_blip.py
@ -39,10 +39,11 @@ class BlipProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = []
    image_processor_class = "BlipImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

-    def __init__(self, image_processor, tokenizer):
+    def __init__(self, image_processor, tokenizer, **kwargs):
        tokenizer.return_token_type_ids = False
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@ -39,11 +39,12 @@ class Blip2Processor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = []
    image_processor_class = "BlipImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # Copied from transformers.models.blip.processing_blip.BlipProcessor.__init__
-    def __init__(self, image_processor, tokenizer):
+    def __init__(self, image_processor, tokenizer, **kwargs):
        tokenizer.return_token_type_ids = False
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
--- a/src/transformers/models/fuyu/processing_fuyu.py
+++ b/src/transformers/models/fuyu/processing_fuyu.py
@ -322,10 +322,11 @@ class FuyuProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = []
    image_processor_class = "FuyuImageProcessor"
    tokenizer_class = "AutoTokenizer"

-    def __init__(self, image_processor, tokenizer):
+    def __init__(self, image_processor, tokenizer, **kwargs):
        super().__init__(image_processor=image_processor, tokenizer=tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@ -173,6 +173,7 @@ class IdeficsProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["image_size", "add_end_of_utterance_token"]
    image_processor_class = "IdeficsImageProcessor"
    tokenizer_class = "LlamaTokenizerFast"

--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@ -61,6 +61,7 @@ class Idefics2Processor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["image_seq_len", "chat_template"]
    image_processor_class = "Idefics2ImageProcessor"
    tokenizer_class = "AutoTokenizer"

--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@ -40,15 +40,16 @@ class InstructBlipProcessor(ProcessorMixin):
            An instance of [`BlipImageProcessor`]. The image processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
-        qformer_tokenizer (`AutoTokenizer`):
+        qformer_tokenizer (`AutoTokenizer`, *optional*):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = []
    image_processor_class = "BlipImageProcessor"
    tokenizer_class = "AutoTokenizer"

-    def __init__(self, image_processor, tokenizer, qformer_tokenizer):
+    def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, **kwargs):
        super().__init__(image_processor, tokenizer)

        # add QFormer tokenizer
@ -167,7 +168,11 @@ class InstructBlipProcessor(ProcessorMixin):
    # overwrite to load the Q-Former tokenizer from a separate folder
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
+        if isinstance(processor, tuple):
+            processor = processor[0]
        qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer")
-        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
-        args.append(qformer_tokenizer)
-        return cls(*args)
+        processor.qformer_tokenizer = qformer_tokenizer
+        return processor
--- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
@ -40,15 +40,16 @@ class InstructBlipVideoProcessor(ProcessorMixin):
            An instance of [`InstructBlipVideoImageProcessor`]. The image processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
-        qformer_tokenizer (`AutoTokenizer`):
+        qformer_tokenizer (`AutoTokenizer`, *optional*):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = []
    image_processor_class = "InstructBlipVideoImageProcessor"
    tokenizer_class = "AutoTokenizer"

-    def __init__(self, image_processor, tokenizer, qformer_tokenizer):
+    def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, **kwargs):
        super().__init__(image_processor, tokenizer)

        # add QFormer tokenizer
@ -164,7 +165,11 @@ class InstructBlipVideoProcessor(ProcessorMixin):
    # overwrite to load the Q-Former tokenizer from a separate folder
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
+        if isinstance(processor, tuple):
+            processor = processor[0]
        qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer")
-        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
-        args.append(qformer_tokenizer)
-        return cls(*args)
+        processor.qformer_tokenizer = qformer_tokenizer
+        return processor
--- a/src/transformers/models/kosmos2/processing_kosmos2.py
+++ b/src/transformers/models/kosmos2/processing_kosmos2.py
@ -54,10 +54,11 @@ class Kosmos2Processor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["num_patch_index_tokens"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

-    def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024):
+    def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024, *kwargs):
        tokenizer.return_token_type_ids = False

        self.eod_token = "</doc>"
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@ -42,10 +42,11 @@ class LlavaProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None):
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@ -42,10 +42,11 @@ class LlavaNextProcessor(ProcessorMixin):
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None):
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@ -53,11 +53,12 @@ class LlavaNextVideoProcessor(ProcessorMixin):
    # video and image processor share same args, but have different processing logic
    # only image processor config is saved in the hub
    attributes = ["video_processor", "image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
    image_processor_class = "LlavaNextImageProcessor"
    video_processor_class = "LlavaNextVideoImageProcessor"
    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")

-    def __init__(self, video_processor=None, image_processor=None, tokenizer=None, chat_template=None):
+    def __init__(self, video_processor=None, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template)

    def __call__(
--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@ -85,9 +85,12 @@ class PaliGemmaProcessor(ProcessorMixin):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
    image_processor_class = "SiglipImageProcessor"
    tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")

@ -95,6 +98,8 @@ class PaliGemmaProcessor(ProcessorMixin):
        self,
        image_processor=None,
        tokenizer=None,
+        chat_template=None,
+        **kwargs,
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
@ -113,7 +118,7 @@ class PaliGemmaProcessor(ProcessorMixin):
        tokenizer.add_bos_token = False
        tokenizer.add_eos_token = False

-        super().__init__(image_processor, tokenizer)
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@ -37,14 +37,17 @@ class VideoLlavaProcessor(ProcessorMixin):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
    image_processor_class = "VideoLlavaImageProcessor"
    tokenizer_class = "AutoTokenizer"

-    def __init__(self, image_processor=None, tokenizer=None):
-        super().__init__(image_processor, tokenizer)
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@ -320,6 +320,7 @@ class ProcessorMixin(PushToHubMixin):
    feature_extractor_class = None
    tokenizer_class = None
    _auto_class = None
+    valid_kwargs: List[str] = []

    # args have to match the attributes class attribute
    def __init__(self, *args, **kwargs):
@ -648,14 +649,15 @@ class ProcessorMixin(PushToHubMixin):
        processor_dict = processor_dict.copy()
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

-        # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`.
-        # We have to pop up some unused (but specific) arguments to make it work.
+        # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
+        # If we don't pop, some specific kwargs will raise a warning
        if "processor_class" in processor_dict:
            del processor_dict["processor_class"]

        if "auto_map" in processor_dict:
            del processor_dict["auto_map"]

+        unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs)
        processor = cls(*args, **processor_dict)

        # Update processor with kwargs if needed
@ -663,6 +665,7 @@ class ProcessorMixin(PushToHubMixin):
            if hasattr(processor, key):
                setattr(processor, key, kwargs.pop(key))

+        kwargs.update(unused_kwargs)
        logger.info(f"Processor {processor}")
        if return_unused_kwargs:
            return processor, kwargs
@ -887,6 +890,19 @@ class ProcessorMixin(PushToHubMixin):
        first_attribute = getattr(self, self.attributes[0])
        return getattr(first_attribute, "model_input_names", None)

+    @staticmethod
+    def validate_init_kwargs(processor_config, valid_kwargs):
+        kwargs_from_config = processor_config.keys()
+        unused_kwargs = {}
+        unused_keys = set(kwargs_from_config) - set(valid_kwargs)
+        if unused_keys:
+            unused_key_str = ", ".join(unused_keys)
+            logger.warning(
+                f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. "
+            )
+            unused_kwargs = {k: processor_config[k] for k in unused_keys}
+        return unused_kwargs
+
    def apply_chat_template(
        self,
        conversation: Union[List[Dict[str, str]]],