diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 3b9d5c369a4..cd96b46ab1d 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -39,10 +39,11 @@ class BlipProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = [] image_processor_class = "BlipImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - def __init__(self, image_processor, tokenizer): + def __init__(self, image_processor, tokenizer, **kwargs): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index ff7044c82ae..2d526a17ba6 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -39,11 +39,12 @@ class Blip2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = [] image_processor_class = "BlipImageProcessor" tokenizer_class = "AutoTokenizer" # Copied from transformers.models.blip.processing_blip.BlipProcessor.__init__ - def __init__(self, image_processor, tokenizer): + def __init__(self, image_processor, tokenizer, **kwargs): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index 2e46cabfa3c..6b542ba3378 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -322,10 +322,11 @@ class FuyuProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = [] image_processor_class = "FuyuImageProcessor" tokenizer_class = "AutoTokenizer" - def 
__init__(self, image_processor, tokenizer): + def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor=image_processor, tokenizer=tokenizer) self.image_processor = image_processor self.tokenizer = tokenizer diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 2afe2a49781..8e9e196764f 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -173,6 +173,7 @@ class IdeficsProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["image_size", "add_end_of_utterance_token"] image_processor_class = "IdeficsImageProcessor" tokenizer_class = "LlamaTokenizerFast" diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 4edb1813b8e..c665ba74d06 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -61,6 +61,7 @@ class Idefics2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["image_seq_len", "chat_template"] image_processor_class = "Idefics2ImageProcessor" tokenizer_class = "AutoTokenizer" diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 4d266d8b98e..adebd22178e 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -40,15 +40,16 @@ class InstructBlipProcessor(ProcessorMixin): An instance of [`BlipImageProcessor`]. The image processor is a required input. tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
- qformer_tokenizer (`AutoTokenizer`): + qformer_tokenizer (`AutoTokenizer`, *optional*): An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = [] image_processor_class = "BlipImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, qformer_tokenizer): + def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) # add QFormer tokenizer @@ -167,7 +168,11 @@ class InstructBlipProcessor(ProcessorMixin): # overwrite to load the Q-Former tokenizer from a separate folder @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs) + + # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs' + if isinstance(processor, tuple): + processor = processor[0] qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer") - args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) - args.append(qformer_tokenizer) - return cls(*args) + processor.qformer_tokenizer = qformer_tokenizer + return processor diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 09571580775..8310b68d736 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -40,15 +40,16 @@ class InstructBlipVideoProcessor(ProcessorMixin): An instance of [`InstructBlipVideoImageProcessor`]. The image processor is a required input. tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. 
- qformer_tokenizer (`AutoTokenizer`): + qformer_tokenizer (`AutoTokenizer`, *optional*): An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input. """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = [] image_processor_class = "InstructBlipVideoImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor, tokenizer, qformer_tokenizer): + def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) # add QFormer tokenizer @@ -164,7 +165,11 @@ class InstructBlipVideoProcessor(ProcessorMixin): # overwrite to load the Q-Former tokenizer from a separate folder @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs) + + # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs' + if isinstance(processor, tuple): + processor = processor[0] qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer") - args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) - args.append(qformer_tokenizer) - return cls(*args) + processor.qformer_tokenizer = qformer_tokenizer + return processor diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index a203ee4c506..6d1cce14b18 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -54,10 +54,11 @@ class Kosmos2Processor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["num_patch_index_tokens"] image_processor_class = "CLIPImageProcessor" tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast") - def __init__(self, image_processor, tokenizer, num_patch_index_tokens=1024): + def __init__(self, image_processor, 
tokenizer, num_patch_index_tokens=1024, **kwargs): tokenizer.return_token_type_ids = False self.eod_token = "</doc>" diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 96d38c53c94..e51e6ba0765 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -42,10 +42,11 @@ class LlavaProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, chat_template=None): + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 6c2ca2f9028..7664b795430 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -42,10 +42,11 @@ class LlavaNextProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, chat_template=None): + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index 24056680ea8..81426b3a0af 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ 
-53,11 +53,12 @@ class LlavaNextVideoProcessor(ProcessorMixin): # video and image processor share same args, but have different processing logic # only image processor config is saved in the hub attributes = ["video_processor", "image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] image_processor_class = "LlavaNextImageProcessor" video_processor_class = "LlavaNextVideoImageProcessor" tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") - def __init__(self, video_processor=None, image_processor=None, tokenizer=None, chat_template=None): + def __init__(self, video_processor=None, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template) def __call__( diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index 37485f0e5cb..3d0ece60c36 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -85,9 +85,12 @@ class PaliGemmaProcessor(ProcessorMixin): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`], *optional*): The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. 
""" attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] image_processor_class = "SiglipImageProcessor" tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast") @@ -95,6 +98,8 @@ class PaliGemmaProcessor(ProcessorMixin): self, image_processor=None, tokenizer=None, + chat_template=None, + **kwargs, ): if image_processor is None: raise ValueError("You need to specify an `image_processor`.") @@ -113,7 +118,7 @@ class PaliGemmaProcessor(ProcessorMixin): tokenizer.add_bos_token = False tokenizer.add_eos_token = False - super().__init__(image_processor, tokenizer) + super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( self, diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index e29c3478aaa..dcdb37bf0c7 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -37,14 +37,17 @@ class VideoLlavaProcessor(ProcessorMixin): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`], *optional*): The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. 
""" attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] image_processor_class = "VideoLlavaImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None): - super().__init__(image_processor, tokenizer) + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( self, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 8f939cadfa0..7062a7699a7 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -320,6 +320,7 @@ class ProcessorMixin(PushToHubMixin): feature_extractor_class = None tokenizer_class = None _auto_class = None + valid_kwargs: List[str] = [] # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -648,14 +649,15 @@ class ProcessorMixin(PushToHubMixin): processor_dict = processor_dict.copy() return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`. - # We have to pop up some unused (but specific) arguments to make it work. 
+ # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs + # If we don't pop, some specific kwargs will raise a warning if "processor_class" in processor_dict: del processor_dict["processor_class"] if "auto_map" in processor_dict: del processor_dict["auto_map"] + unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) processor = cls(*args, **processor_dict) # Update processor with kwargs if needed @@ -663,6 +665,7 @@ class ProcessorMixin(PushToHubMixin): if hasattr(processor, key): setattr(processor, key, kwargs.pop(key)) + kwargs.update(unused_kwargs) logger.info(f"Processor {processor}") if return_unused_kwargs: return processor, kwargs @@ -887,6 +890,19 @@ class ProcessorMixin(PushToHubMixin): first_attribute = getattr(self, self.attributes[0]) return getattr(first_attribute, "model_input_names", None) + @staticmethod + def validate_init_kwargs(processor_config, valid_kwargs): + kwargs_from_config = processor_config.keys() + unused_kwargs = {} + unused_keys = set(kwargs_from_config) - set(valid_kwargs) + if unused_keys: + unused_key_str = ", ".join(unused_keys) + logger.warning( + f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. " + ) + unused_kwargs = {k: processor_config[k] for k in unused_keys} + return unused_kwargs + def apply_chat_template( self, conversation: Union[List[Dict[str, str]]],