[qwen-omni] fix processor (#37493)

* fix * delete print * accept kwargs in overridden models as well * remove duplicate
2025-07-04 13:20:12 +06:00 · 2025-04-14 17:30:31 +02:00 · 2025-04-14 17:30:31 +02:00 · cb39f7dd5b
commit cb39f7dd5b
parent d228f50acc
4 changed files with 13 additions and 6 deletions
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -244,11 +244,13 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
                        curr_video_grid_thw = next(video_grid_thw)
                        height = curr_video_grid_thw[1] // self.image_processor.merge_size
                        width = curr_video_grid_thw[2] // self.image_processor.merge_size
-                        video_token_indices = np.arange(curr_video_grid_thw[0]).view(-1, 1, 1)
-                        video_token_indices = video_token_indices.expand(-1, height, width).flatten()
+                        video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1)
+                        video_token_indices = np.broadcast_to(
+                            video_token_indices, (video_token_indices.shape[0], height, width)
+                        ).reshape(-1)
                        video_token_indices = (
                            video_token_indices * next(video_second_per_grid) * position_id_per_seconds
-                        ).long()
+                        )

                        tokens_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
                        video_chunk_indexes = self.get_chunked_index(video_token_indices, tokens_per_chunk)
--- a/src/transformers/models/smolvlm/processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/processing_smolvlm.py
@@ -436,6 +436,7 @@ class SmolVLMProcessor(ProcessorMixin):
        fps: Optional[int] = None,
        backend: str = "opencv",
        skip_secs: int = 0.0,
+        **kwargs,
    ) -> np.array:
        """
        Loads `video` to a numpy array.
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -1427,7 +1427,6 @@ class ProcessorMixin(PushToHubMixin):

        # Fill sets of kwargs that should be used by different parts of template
        processed_kwargs = {
-            "processor_kwargs": {},
            "mm_load_kwargs": {},
            "template_kwargs": {},
        }
@@ -1551,14 +1550,14 @@ class ProcessorMixin(PushToHubMixin):
            # without actionable solution for users
            single_prompt = prompt[0] if is_batched else prompt
            if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
-                processed_kwargs["processor_kwargs"]["add_special_tokens"] = False
+                kwargs["add_special_tokens"] = False

            out = self(
                text=prompt,
                images=batch_images if batch_images else None,
                videos=batch_videos if batch_videos else None,
                audio=batch_audios if batch_audios else None,
-                **processed_kwargs["processor_kwargs"],
+                **kwargs,
            )
            if return_dict:
                return out
@@ -1574,6 +1573,7 @@ class ProcessorMixin(PushToHubMixin):
        num_frames: Optional[int] = None,
        fps: Optional[int] = None,
        backend: str = "opencv",
+        **kwargs,
    ) -> np.array:
        """
        Loads `video` to a numpy array.
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -228,6 +228,10 @@ class GenerationTesterMixin:
                "video_token_index",
                "video_token_id",
                "vision_start_token_id",
+                "audio_token_index",
+                "audio_start_token_id",
+                "audio_end_token_id",
+                "vision_end_token_id",
            ]:
                token_index = getattr(config, key, None)
                if token_index is None and hasattr(self, "model_tester"):