diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index 3a711a2805f..530d74d51b5 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -1013,10 +1013,12 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
             image_grid_thw = image_inputs["image_grid_thw"]
 
         if videos is not None:
+            # pop fps in advance so the remaining kwargs pass validation
+            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
+
             videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             video_grid_thw = videos_inputs["video_grid_thw"]
 
-            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
             if isinstance(fps, (int, float)):
                 second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
             elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
index e3671dc3c43..8b05b725bf9 100644
--- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
@@ -152,10 +152,12 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
             image_grid_thw = image_inputs["image_grid_thw"]
 
         if videos is not None:
+            # pop fps in advance so the remaining kwargs pass validation
+            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
+
             videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             video_grid_thw = videos_inputs["video_grid_thw"]
 
-            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
             if isinstance(fps, (int, float)):
                 second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw)
             elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
diff --git a/src/transformers/video_processing_utils.py b/src/transformers/video_processing_utils.py
index 108dbada648..527760eb8da 100644
--- a/src/transformers/video_processing_utils.py
+++ b/src/transformers/video_processing_utils.py
@@ -250,7 +250,10 @@ class BaseVideoProcessor(BaseImageProcessorFast):
         videos: VideoInput,
         **kwargs: Unpack[VideosKwargs],
     ) -> BatchFeature:
-        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
+        validate_kwargs(
+            captured_kwargs=kwargs.keys(),
+            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
+        )
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
         for kwarg_name in self.valid_kwargs.__annotations__:
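
For context, a minimal standalone sketch of the fps -> second_per_grid_ts mapping that the relocated pop feeds in the two processor hunks above. The grid shapes are made-up values; temporal_patch_size = 2 mirrors the Qwen2.5-VL default:

# Hedged sketch of the fps handling above; video_grid_thw values are illustrative.
temporal_patch_size = 2                      # stands in for self.video_processor.temporal_patch_size
video_grid_thw = [(8, 24, 24), (4, 24, 24)]  # one (t, h, w) grid per video

fps = 2.0  # a single number applies to every video
if isinstance(fps, (int, float)):
    second_per_grid_ts = [temporal_patch_size / fps] * len(video_grid_thw)
elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
    second_per_grid_ts = [temporal_patch_size / f for f in fps]  # per-video fps
else:
    raise ValueError(f"fps must be a number or a sequence of length {len(video_grid_thw)}")

print(second_per_grid_ts)  # [1.0, 1.0]: each temporal grid step spans one second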
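
The video_processing_utils.py hunk exists because fps is consumed by the processor itself (hence the pop above), while return_tensors is forwarded along with the video kwargs and so must be whitelisted explicitly. The simplified validator below is an assumption-level stand-in, not the library's implementation (the real transformers helper logs a warning rather than calling warnings.warn); it only demonstrates the behavior being patched:

import warnings

def validate_kwargs(captured_kwargs, valid_processor_keys):
    # Stand-in for the transformers helper: flag any kwarg not in the whitelist.
    unused = set(captured_kwargs) - set(valid_processor_keys)
    if unused:
        warnings.warn(f"Unused or unrecognized kwargs: {', '.join(sorted(unused))}.")

valid = ["do_resize", "size", "do_rescale"]  # illustrative subset of VideosKwargs
validate_kwargs(captured_kwargs=["size", "return_tensors"], valid_processor_keys=valid)
# -> warns about return_tensors
validate_kwargs(captured_kwargs=["size", "return_tensors"], valid_processor_keys=valid + ["return_tensors"])
# -> silent, which is what appending "return_tensors" in the patch achieves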