diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index e223bbe59c4..d607b8b95e8 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -244,11 +244,13 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
                 curr_video_grid_thw = next(video_grid_thw)
                 height = curr_video_grid_thw[1] // self.image_processor.merge_size
                 width = curr_video_grid_thw[2] // self.image_processor.merge_size
-                video_token_indices = np.arange(curr_video_grid_thw[0]).view(-1, 1, 1)
-                video_token_indices = video_token_indices.expand(-1, height, width).flatten()
+                video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1)
+                video_token_indices = np.broadcast_to(
+                    video_token_indices, (video_token_indices.shape[0], height, width)
+                ).reshape(-1)
                 video_token_indices = (
                     video_token_indices * next(video_second_per_grid) * position_id_per_seconds
-                ).long()
+                )
 
                 tokens_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
                 video_chunk_indexes = self.get_chunked_index(video_token_indices, tokens_per_chunk)
diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py
index 4cf207c4591..615e3104d6c 100644
--- a/src/transformers/models/smolvlm/processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/processing_smolvlm.py
@@ -436,6 +436,7 @@ class SmolVLMProcessor(ProcessorMixin):
         fps: Optional[int] = None,
         backend: str = "opencv",
         skip_secs: int = 0.0,
+        **kwargs,
     ) -> np.array:
         """
         Loads `video` to a numpy array.
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 9593d465a75..b650972fb4f 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -1427,7 +1427,6 @@ class ProcessorMixin(PushToHubMixin):
 
         # Fill sets of kwargs that should be used by different parts of template
         processed_kwargs = {
-            "processor_kwargs": {},
             "mm_load_kwargs": {},
             "template_kwargs": {},
         }
@@ -1551,14 +1550,14 @@
             # without actionable solution for users
             single_prompt = prompt[0] if is_batched else prompt
             if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
-                processed_kwargs["processor_kwargs"]["add_special_tokens"] = False
+                kwargs["add_special_tokens"] = False
 
             out = self(
                 text=prompt,
                 images=batch_images if batch_images else None,
                 videos=batch_videos if batch_videos else None,
                 audio=batch_audios if batch_audios else None,
-                **processed_kwargs["processor_kwargs"],
+                **kwargs,
             )
             if return_dict:
                 return out
@@ -1574,6 +1573,7 @@
         num_frames: Optional[int] = None,
         fps: Optional[int] = None,
         backend: str = "opencv",
+        **kwargs,
     ) -> np.array:
         """
         Loads `video` to a numpy array.
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index fc74c1ae714..5cda62c3974 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -228,6 +228,10 @@ class GenerationTesterMixin:
             "video_token_index",
             "video_token_id",
             "vision_start_token_id",
+            "audio_token_index",
+            "audio_start_token_id",
+            "audio_end_token_id",
+            "vision_end_token_id",
         ]:
            token_index = getattr(config, key, None)
            if token_index is None and hasattr(self, "model_tester"):
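
The core of the Qwen2.5-Omni change is swapping torch.Tensor-only methods (`.view()`, `.expand()`, `.long()`) for numpy equivalents, since `np.arange()` returns an `ndarray` that has none of those methods. Below is a minimal standalone sketch of the equivalence; the values for `t`, `height`, and `width` are illustrative and not taken from the diff:

    import numpy as np

    t, height, width = 4, 2, 3  # illustrative grid dimensions

    # Old (torch-only) pattern, shown for comparison; fails on an ndarray:
    #   indices = torch.arange(t).view(-1, 1, 1).expand(-1, height, width).flatten()

    # New numpy pattern from the diff:
    indices = np.arange(t).reshape(-1, 1, 1)  # shape (t, 1, 1)
    indices = np.broadcast_to(indices, (t, height, width)).reshape(-1)  # shape (t*h*w,)

    # Each temporal index is repeated once per spatial grid cell, in order.
    assert indices.shape == (t * height * width,)
    assert list(indices[: 2 * height * width]) == [0] * (height * width) + [1] * (height * width)

Note that `np.broadcast_to` returns a read-only view, and the final `.reshape(-1)` materializes a fresh contiguous array, so the repeated indices behave like the output of torch's `expand(...).flatten()`.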