diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
index e223bbe59c4..d607b8b95e8 100644
--- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -244,11 +244,13 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
                 curr_video_grid_thw = next(video_grid_thw)
                 height = curr_video_grid_thw[1] // self.image_processor.merge_size
                 width = curr_video_grid_thw[2] // self.image_processor.merge_size
-                video_token_indices = np.arange(curr_video_grid_thw[0]).view(-1, 1, 1)
-                video_token_indices = video_token_indices.expand(-1, height, width).flatten()
+                video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1)
+                video_token_indices = np.broadcast_to(
+                    video_token_indices, (video_token_indices.shape[0], height, width)
+                ).reshape(-1)
                 video_token_indices = (
                     video_token_indices * next(video_second_per_grid) * position_id_per_seconds
-                ).long()
+                )
 
                 tokens_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
                 video_chunk_indexes = self.get_chunked_index(video_token_indices, tokens_per_chunk)
diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py
index 4cf207c4591..615e3104d6c 100644
--- a/src/transformers/models/smolvlm/processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/processing_smolvlm.py
@@ -436,6 +436,7 @@ class SmolVLMProcessor(ProcessorMixin):
         fps: Optional[int] = None,
         backend: str = "opencv",
         skip_secs: int = 0.0,
+        **kwargs,
     ) -> np.array:
         """
         Loads `video` to a numpy array.
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 9593d465a75..b650972fb4f 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -1427,7 +1427,6 @@ class ProcessorMixin(PushToHubMixin):
 
         # Fill sets of kwargs that should be used by different parts of template
         processed_kwargs = {
-            "processor_kwargs": {},
             "mm_load_kwargs": {},
             "template_kwargs": {},
         }
@@ -1551,14 +1550,14 @@
             # without actionable solution for users
             single_prompt = prompt[0] if is_batched else prompt
             if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
-                processed_kwargs["processor_kwargs"]["add_special_tokens"] = False
+                kwargs["add_special_tokens"] = False
 
             out = self(
                 text=prompt,
                 images=batch_images if batch_images else None,
                 videos=batch_videos if batch_videos else None,
                 audio=batch_audios if batch_audios else None,
-                **processed_kwargs["processor_kwargs"],
+                **kwargs,
             )
             if return_dict:
                 return out
@@ -1574,6 +1573,7 @@
         num_frames: Optional[int] = None,
         fps: Optional[int] = None,
         backend: str = "opencv",
+        **kwargs,
     ) -> np.array:
         """
         Loads `video` to a numpy array.
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index fc74c1ae714..5cda62c3974 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -228,6 +228,10 @@ class GenerationTesterMixin:
             "video_token_index",
             "video_token_id",
             "vision_start_token_id",
+            "audio_token_index",
+            "audio_start_token_id",
+            "audio_end_token_id",
+            "vision_end_token_id",
         ]:
            token_index = getattr(config, key, None)
            if token_index is None and hasattr(self, "model_tester"):
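
The core of the Qwen2.5-Omni change is swapping torch.Tensor-only methods (`.view()`, `.expand()`, `.long()`) for numpy equivalents, since `np.arange()` returns an `ndarray` that has none of those methods. Below is a minimal standalone sketch of the equivalence; the values for `t`, `height`, and `width` are illustrative and not taken from the diff:

    import numpy as np

    t, height, width = 4, 2, 3  # illustrative grid dimensions

    # Old (torch-only) pattern, shown for comparison; fails on an ndarray:
    #   indices = torch.arange(t).view(-1, 1, 1).expand(-1, height, width).flatten()

    # New numpy pattern from the diff:
    indices = np.arange(t).reshape(-1, 1, 1)  # shape (t, 1, 1)
    indices = np.broadcast_to(indices, (t, height, width)).reshape(-1)  # shape (t*h*w,)

    # Each temporal index is repeated once per spatial grid cell, in order.
    assert indices.shape == (t * height * width,)
    assert list(indices[: 2 * height * width]) == [0] * (height * width) + [1] * (height * width)

Note that `np.broadcast_to` returns a read-only view, and the final `.reshape(-1)` materializes a fresh contiguous array, so the repeated indices behave like the output of torch's `expand(...).flatten()`.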