Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-03 04:40:06 +06:00)

[video processors] support frame sampling within processors (#38105)
* apply updates to SmolVLM (still needs a workaround for the chat template)
* add other models
* dump qwen omni for now, come back later
* port qwen omni from their impl
* wait, all qwens sample videos in the same way!
* clean up
* make smolvlm backwards compatible and fix padding
* fix some tests
* fix smolvlm tests
* more clean up and test fixing
* delete unused arg
* fix
* address comments
* style
* fix test

This commit is contained in:
parent 887054c714
commit 27459025b8

@@ -35,7 +35,7 @@ from ...utils import (
)
from ...utils.import_utils import requires
from ...video_processing_utils import BaseVideoProcessor
from ...video_utils import group_videos_by_shape, reorder_videos
from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos

if is_vision_available():
@@ -66,6 +66,7 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
do_rescale = True
do_normalize = True
do_convert_rgb = True
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = InstructBlipVideoVideoProcessorInitKwargs
model_input_names = ["pixel_values"]

@@ -75,6 +76,7 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
def _preprocess(
self,
videos: List["torch.Tensor"],
video_metadata: Union[List[VideoMetadata], List[dict]],
do_convert_rgb: bool,
do_resize: bool,
size: SizeDict,
@@ -86,10 +88,18 @@ class InstructBlipVideoVideoProcessor(BaseVideoProcessor):
do_pad: bool,
rescale_factor: float,
do_normalize: bool,
do_sample_frames: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
return_tensors: Optional[Union[str, TensorType]],
fps: Optional[int] = None,
num_frames: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
if do_sample_frames:
videos = [
self.sample_frames(video, metadata, num_frames, fps) for video, metadata in zip(videos, video_metadata)
]

# Group videos by size for batched resizing
grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
resized_videos_grouped = {}
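
Illustrative note (not part of the commit): the new `do_sample_frames` branch above subsamples each clip before the batched resize, so clips of different lengths all end up with the same frame count. A minimal, self-contained sketch of that per-video sampling loop, using the same kind of uniform-index formula as the `sample_frames` implementations in this diff; the helper name and shapes are made up for the example.

# Illustrative sketch, not part of the commit.
import torch

def uniform_sample(video: torch.Tensor, num_frames: int) -> torch.Tensor:
    # Uniformly spaced frame indices, as in the sample_frames methods added in this diff.
    total = video.shape[0]
    indices = torch.arange(0, total, total / num_frames).int()
    return video[indices].contiguous()

videos = [torch.zeros(120, 3, 224, 224), torch.zeros(90, 3, 224, 224)]
sampled = [uniform_sample(video, num_frames=8) for video in videos]
print([v.shape[0] for v in sampled])  # [8, 8] -> ready for group_videos_by_shape
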
@@ -21,7 +21,7 @@ from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...video_utils import VideoInput, VideoMetadata, load_video, make_batched_videos
from ...video_utils import VideoInput, make_batched_videos

class InternVLImagesKwargs(ImagesKwargs, total=False):
@@ -290,32 +290,6 @@ class InternVLProcessor(ProcessorMixin):

return MultiModalData(**vision_data)

def sample_indices_fn(
self, metadata: VideoMetadata, num_frames: Optional[int] = None, initial_shift: Union[bool, float, int] = True
):
"""
The function to generate indices of frames to sample from a video.

Args:
metadata (`VideoMetadata`):
`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If None, all frames are sampled.
initial_shift (`bool`, `float` or `int`, defaults to `0`):
The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.

Returns:
`np.ndarray`: Array of frame indices to sample.
"""
num_frames = num_frames if num_frames is not None else metadata.total_num_frames

if initial_shift is True:
initial_shift = metadata.total_num_frames / num_frames / 2
indices = np.arange(initial_shift, metadata.total_num_frames, metadata.total_num_frames / num_frames).astype(
int
)
return indices

def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
@@ -336,39 +310,5 @@ class InternVLProcessor(ProcessorMixin):
image_processor_input_names = self.image_processor.model_input_names
return list(tokenizer_input_names) + list(image_processor_input_names)

# TODO: raushan, has to be public method under `VideoProcessorBase` when API is added
def _load_video_for_model(
self,
video: Union[str, "VideoInput"],
num_frames: Optional[int],
backend: str = "pyav",
initial_shift: bool = True,
**kwargs,
) -> np.array:
"""
Loads `video` to a numpy array.

Args:
video (`str` or `VideoInput`):
The video to convert to the numpy array format. Can be a link to video or local path.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not passed, the whole video is loaded.
backend (`str`, *optional*, defaults to `"pyav"`):
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav".
initial_shift (`bool`, *optional*, defaults to `True`):
The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.

Returns:
Tuple[`np.array`, Dict]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- Metadata dictionary.
"""

def sample_indices_fn_func(metadata, **fn_kwargs):
return self.sample_indices_fn(metadata, num_frames=num_frames, initial_shift=initial_shift, **fn_kwargs)

video, metadata = load_video(video, backend=backend, sample_indices_fn=sample_indices_fn_func)
return video, metadata

__all__ = ["InternVLProcessor"]
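
Context note with a sketch (not part of the commit): the two methods removed above were the frame-sampling path that ran at video-load time; the equivalent index math now lives in `InternVLVideoProcessor.sample_frames` in the next file. For reference, this is the computation the removed `sample_indices_fn` performed, with concrete numbers:

# Illustrative sketch, not part of the commit: the index math from the removed sample_indices_fn.
import numpy as np

total_num_frames, num_frames = 300, 8
initial_shift = total_num_frames / num_frames / 2   # True -> start sampling mid-interval (18.75)
indices = np.arange(initial_shift, total_num_frames, total_num_frames / num_frames).astype(int)
print(indices)  # [ 18  56  93 131 168 206 243 281]
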
@@ -14,25 +14,43 @@
# limitations under the License.
"""Fast Video processor class for InternVL."""

from typing import List, Optional, Union

from ...image_processing_utils import BatchFeature
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
SizeDict,
)
from ...processing_utils import Unpack, VideosKwargs
from ...utils import (
TensorType,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
is_vision_available,
)
from ...utils.import_utils import requires
from ...video_processing_utils import (
BaseVideoProcessor,
)
from ...video_processing_utils import BaseVideoProcessor
from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos

if is_torchvision_available():
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
else:
from torchvision.transforms import functional as F

if is_torch_available():
import torch

if is_vision_available():
from ...image_utils import PILImageResampling

class InternVLVideoProcessorInitKwargs(VideosKwargs): ...
class InternVLVideoProcessorInitKwargs(VideosKwargs):
initial_shift: Union[bool, float, int]

@requires(backends=("torchvision",))
@@ -45,11 +63,128 @@ class InternVLVideoProcessor(BaseVideoProcessor):
do_rescale = True
do_normalize = True
do_convert_rgb = True
initial_shift = True
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = InternVLVideoProcessorInitKwargs
model_input_names = ["pixel_values_videos"]

def __init__(self, **kwargs: Unpack[InternVLVideoProcessorInitKwargs]):
super().__init__(**kwargs)

def sample_frames(
self,
video: "torch.Tensor",
metadata: Optional[Union[VideoMetadata, dict]] = None,
num_frames: Optional[int] = None,
fps: Optional[int] = None,
initial_shift: Optional[Union[bool, float, int]] = None,
):
"""
Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
If `fps` is passed along with metadata, `fps` frames per second are sampled uniformty. Arguments `num_frames`
and `fps` are mutually exclusive.

Args:
video (`torch.Tensor`):
Video that need to be sampled.
metadata (`VideoMetadata`, *optional*):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
Target frames to sample per second. Defaults to `self.fps`.
initial_shift (`bool`, `float` or `int`, defaults to `self.initial_shift`):
The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.

Returns:
torch.Tensor:
Sampled video frames.
"""
num_frames = num_frames if num_frames is not None else self.num_frames
initial_shift = initial_shift if initial_shift is not None else self.initial_shift
total_num_frames = video.shape[0]

# If num_frames is not given but fps is, calculate num_frames from fps
if num_frames is None and fps is not None:
if metadata is None:
raise ValueError(
"Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
"Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
)
num_frames = int(total_num_frames / metadata["fps"] * fps)

if initial_shift is True:
initial_shift = total_num_frames / num_frames / 2

if num_frames > total_num_frames:
raise ValueError(
f"Video can't be sampled. The `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
)

indices = torch.arange(initial_shift, total_num_frames, total_num_frames / num_frames).int()
video = video[indices].contiguous()
return video
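
Illustrative worked example (not part of the commit) for the `fps` path of `sample_frames` above, assuming dict metadata as the method accepts; the numbers are made up:

# Illustrative sketch, not part of the commit.
import torch

video = torch.zeros(300, 3, 448, 448)        # 300 decoded frames
metadata = {"fps": 30, "duration": 10.0}     # a plain dict is accepted alongside VideoMetadata
fps = 2                                      # keep two frames per second of footage

num_frames = int(video.shape[0] / metadata["fps"] * fps)          # 300 / 30 * 2 -> 20
initial_shift = video.shape[0] / num_frames / 2                   # True -> 7.5, i.e. start mid-interval
indices = torch.arange(initial_shift, video.shape[0], video.shape[0] / num_frames).int()
print(indices[:3].tolist(), len(indices))                         # [7, 22, 37] 20
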
def _preprocess(
self,
videos: List["torch.Tensor"],
video_metadata: Union[List[VideoMetadata], List[dict]],
do_convert_rgb: bool,
do_resize: bool,
size: SizeDict,
size_divisor: Optional[int],
interpolation: Optional["F.InterpolationMode"],
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
do_pad: bool,
rescale_factor: float,
do_normalize: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
do_sample_frames: Optional[bool] = None,
fps: Optional[int] = None,
num_frames: Optional[int] = None,
initial_shift: Optional[Union[bool, float, int]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
if do_sample_frames:
# Sample video frames
videos = [
self.sample_frames(video, metadata, fps=fps, num_frames=num_frames, initial_shift=initial_shift)
for video, metadata in zip(videos, video_metadata)
]

# Group videos by size for batched resizing
grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
resized_videos_grouped = {}
for shape, stacked_videos in grouped_videos.items():
if do_convert_rgb:
stacked_videos = self.convert_to_rgb(stacked_videos)
if do_resize:
stacked_videos = self.resize(
stacked_videos, size=size, size_divisor=size_divisor, interpolation=interpolation
)
resized_videos_grouped[shape] = stacked_videos
resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)

# Group videos by size for further processing
# Needed in case do_resize is False, or resize returns videos with different sizes
grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
processed_videos_grouped = {}
for shape, stacked_videos in grouped_videos.items():
if do_center_crop:
stacked_videos = self.center_crop(stacked_videos, crop_size)
# Fused rescale and normalize
stacked_videos = self.rescale_and_normalize(
stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
)
processed_videos_grouped[shape] = stacked_videos

processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos

return BatchFeature(data={"pixel_values_videos": processed_videos}, tensor_type=return_tensors)

__all__ = ["InternVLVideoProcessor"]
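
Side note with a sketch (not part of the commit): `_preprocess` above, like every `_preprocess` touched in this diff, follows a group-by-shape, batch-process, reorder pattern so that same-shaped clips are processed as one stacked tensor. The helpers below are simplified stand-ins for `group_videos_by_shape` / `reorder_videos` from `video_utils`, just to make the idiom concrete:

# Illustrative sketch, not part of the commit: simplified stand-ins for the real helpers.
import torch

def group_by_shape(videos):
    grouped, order = {}, []
    for video in videos:
        key = tuple(video.shape)
        grouped.setdefault(key, []).append(video)
        order.append((key, len(grouped[key]) - 1))
    return {k: torch.stack(v) for k, v in grouped.items()}, order

def reorder(grouped, order):
    return [grouped[key][i] for key, i in order]

videos = [torch.zeros(8, 3, 224, 224), torch.zeros(8, 3, 336, 336), torch.zeros(8, 3, 224, 224)]
grouped, order = group_by_shape(videos)
processed = {key: stack / 255.0 for key, stack in grouped.items()}   # one batched op per shape
print([tuple(v.shape) for v in reorder(processed, order)])           # original order is restored
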
@@ -46,6 +46,7 @@ class LlavaNextVideoVideoProcessor(BaseVideoProcessor):
do_rescale = True
do_normalize = True
do_convert_rgb = True
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = LlavaNextVideoFastVideoProcessorInitKwargs
model_input_names = ["pixel_values_videos"]

@@ -47,6 +47,7 @@ class LlavaOnevisionVideoProcessor(BaseVideoProcessor):
do_rescale = True
do_normalize = True
do_convert_rgb = True
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = LlavaOnevisionFastVideoProcessorInitKwargs
model_input_names = ["pixel_values_videos"]

@@ -154,7 +154,7 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
seconds_per_chunk = output_kwargs["videos_kwargs"].pop("seconds_per_chunk")
position_id_per_seconds = output_kwargs["videos_kwargs"].pop("position_id_per_seconds")
use_audio_in_video = output_kwargs["videos_kwargs"].pop("use_audio_in_video")
fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
fps = output_kwargs["videos_kwargs"].get("fps", 2.0)

if audio is not None:
output_kwargs["audio_kwargs"]["padding"] = "max_length" # Support "max_length" padding only here

@@ -928,7 +928,6 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
"padding": False,
"return_mm_token_type_ids": False,
},
"videos_kwargs": {"fps": 2.0},
}

@@ -1013,9 +1012,7 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
image_grid_thw = image_inputs["image_grid_thw"]

if videos is not None:
# pop fps in advance for passing kwargs validation
fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)

fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
video_grid_thw = videos_inputs["video_grid_thw"]

@@ -54,7 +54,6 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
"padding": False,
"return_mm_token_type_ids": False,
},
"videos_kwargs": {"fps": 2.0},
}

@@ -151,9 +150,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
image_grid_thw = image_inputs["image_grid_thw"]

if videos is not None:
# pop fps in advance for passing kwargs validation
fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)

fps = output_kwargs["videos_kwargs"].get("fps", 2.0)
videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
video_grid_thw = videos_inputs["video_grid_thw"]
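
A brief note with a sketch (not part of the commit): switching from `.pop` to `.get` appears intended to leave `fps` inside `videos_kwargs`, so the value is still forwarded to `self.video_processor(...)` below, which after this PR can use it to sample frames itself, while the processor keeps reading the same value as before.

# Illustrative sketch, not part of the commit.
videos_kwargs = {"fps": 2.0, "return_tensors": "pt"}

fps = videos_kwargs.get("fps", 2.0)    # read the value without removing it
assert "fps" in videos_kwargs          # still present for self.video_processor(**videos_kwargs)
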
@@ -19,6 +19,7 @@
# limitations under the License.
"""video processor class for Qwen2-VL."""

import math
from typing import List, Optional, Union

from ...image_processing_utils import (
@@ -45,7 +46,7 @@ from ...video_processing_utils import (
BASE_VIDEO_PROCESSOR_DOCSTRING,
BaseVideoProcessor,
)
from ...video_utils import group_videos_by_shape, reorder_videos
from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos

if is_vision_available():
@@ -69,6 +70,8 @@ class Qwen2VLVideoProcessorInitKwargs(VideosKwargs):
patch_size: Optional[int]
temporal_patch_size: Optional[int]
merge_size: Optional[int]
min_frames: Optional[int]
max_frames: Optional[int]

@add_start_docstrings(
@@ -85,23 +88,30 @@ class Qwen2VLVideoProcessorInitKwargs(VideosKwargs):
The temporal patch size of the vision encoder.
merge_size (`int`, *optional*, defaults to 2):
The merge size of the vision encoder to llm encoder.
min_frames (`int`, *optional*, defaults to 4):
The minimum number of frames that can be sampled.
max_frames (`int`, *optional*, defaults to 768):
The maximum number of frames that can be sampled.
""",
)
@requires(backends=("torchvision",))
class Qwen2VLVideoProcessor(BaseVideoProcessor):
resample = PILImageResampling.BICUBIC
size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
size = {"shortest_edge": 128 * 28 * 28, "longest_edge": 28 * 28 * 768}
image_mean = OPENAI_CLIP_MEAN
image_std = OPENAI_CLIP_STD
do_resize = True
do_rescale = True
do_normalize = True
do_convert_rgb = True
min_pixels = 56 * 56
max_pixels = 28 * 28 * 1280
min_pixels = 128 * 28 * 28
max_pixels = 28 * 28 * 768
patch_size = 14
temporal_patch_size = 2
merge_size = 2
min_frames = 4
max_frames = 768
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = Qwen2VLVideoProcessorInitKwargs
model_input_names = ["pixel_values_videos", "video_grid_thw"]
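
Side note (not part of the commit): the default pixel budget above changes along with the sampling support; in plain numbers, using the expressions from the diff:

# Illustrative arithmetic, not part of the commit.
old_min_pixels, old_max_pixels = 56 * 56, 28 * 28 * 1280        # 3,136 and 1,003,520
new_min_pixels, new_max_pixels = 128 * 28 * 28, 28 * 28 * 768   # 100,352 and 602,112
print(old_min_pixels, old_max_pixels, new_min_pixels, new_max_pixels)
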
@@ -109,9 +119,80 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
super().__init__(**kwargs)
self.size = {"shortest_edge": self.min_pixels, "longest_edge": self.max_pixels}

def sample_frames(
self,
video: "torch.Tensor",
frame_factor: int,
min_frames: int,
max_frames: int,
metadata: Optional[Union[VideoMetadata, dict]] = None,
num_frames: Optional[int] = None,
fps: Optional[int] = None,
):
"""
Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
If `fps` is passed along with metadata, `fps` frames per second are sampled uniformty. Arguments `num_frames`
and `fps` are mutually exclusive.

Args:
video (`torch.Tensor`):
Video that need to be sampled.
frame_factor (`int`):
The temporal patch size of the vision encoder. Number of sampled frames will be rounded to be divisible by frame factor.
min_frames (`int`):
The minimum number of frames that can be sampled.
max_frames (`int`):
The maximum number of frames that can be sampled.
metadata (`VideoMetadata`, *optional*):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
Target frames to sample per second. Defaults to `self.fps`.

Returns:
torch.Tensor:
Sampled video frames.
"""
if fps is not None and num_frames is not None:
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

num_frames = num_frames if num_frames is not None else self.num_frames
fps = fps if fps is not None else self.fps
total_num_frames = video.shape[0]

# If num_frames is not given but fps is, calculate num_frames from fps
if num_frames is not None:
num_frames = round(num_frames / frame_factor) * frame_factor
elif fps is not None:
if metadata is None:
raise ValueError(
"Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
"Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
)
max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
num_frames = total_num_frames / metadata["fps"] * fps
num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
num_frames = math.floor(num_frames / frame_factor) * frame_factor

if num_frames > total_num_frames:
raise ValueError(
f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
"Decrease `num_frames` or `fps` for sampling."
)

if num_frames is not None:
indices = torch.arange(0, total_num_frames, total_num_frames / num_frames).int()
else:
indices = torch.arange(0, total_num_frames).int()
video = video[indices].contiguous()

return video
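
Illustrative worked example (not part of the commit) for the `fps` branch of `sample_frames` above, spelling out the clamping and the `frame_factor` rounding for one concrete clip; numbers are made up:

# Illustrative sketch, not part of the commit.
import math
import torch

video = torch.zeros(300, 3, 336, 336)      # 300 decoded frames of a 10 s clip
metadata = {"fps": 30}
frame_factor, min_frames, max_frames, fps = 2, 4, 768, 2

total_num_frames = video.shape[0]
max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor   # 300
num_frames = total_num_frames / metadata["fps"] * fps                                       # 20.0
num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)            # 20.0
num_frames = math.floor(num_frames / frame_factor) * frame_factor                           # 20

indices = torch.arange(0, total_num_frames, total_num_frames / num_frames).int()
print(num_frames, indices[:4].tolist())                                                     # 20 [0, 15, 30, 45]
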
def _preprocess(
self,
videos: List["torch.Tensor"],
video_metadata: Union[List[VideoMetadata], List[dict]],
do_convert_rgb: bool,
do_resize: bool,
size: SizeDict,
@@ -119,6 +200,7 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
do_sample_frames: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
min_pixels: Optional[int] = None,
@@ -126,9 +208,28 @@ class Qwen2VLVideoProcessor(BaseVideoProcessor):
patch_size: Optional[int] = None,
temporal_patch_size: Optional[int] = None,
merge_size: Optional[int] = None,
fps: Optional[int] = None,
num_frames: Optional[int] = None,
min_frames: Optional[int] = None,
max_frames: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
if do_sample_frames:
# Sample video frames
videos = [
self.sample_frames(
video,
frame_factor=temporal_patch_size,
min_frames=min_frames,
max_frames=max_frames,
metadata=metadata,
num_frames=num_frames,
fps=fps,
)
for video, metadata in zip(videos, video_metadata)
]

# Group videos by size for batched resizing
grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
resized_videos_grouped = {}

@@ -16,18 +16,15 @@
Processor class for SmolVLM.
"""

import copy
from datetime import timedelta
from typing import TYPE_CHECKING, Dict, List, Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...processing_utils import AllKwargsForChatTemplate, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import BatchEncoding, TextInput
from ...utils import is_num2words_available, is_vision_available, logging
from ...video_utils import VideoInput, load_video, make_batched_videos
from ...video_utils import VideoInput

if is_vision_available():
@@ -35,7 +32,13 @@ if is_vision_available():
DEFAULT_MEDIA_OUTTRO,
DEFAULT_VIDEO_INTRO,
FRAME_TIMESTAMP_MESSAGE,
smolvlm_sample_indices_fn,
)

if is_vision_available():
from .video_processing_smolvlm import (
DEFAULT_MEDIA_OUTTRO,
DEFAULT_VIDEO_INTRO,
FRAME_TIMESTAMP_MESSAGE,
)

if TYPE_CHECKING:
@@ -50,6 +53,10 @@ else:
num2words = None

# The correct chat template to be used for videos after #38105
DEFAULT_CHAT_TEMPLATE = "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% elif line['type'] == 'video' %}{{ '<video>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"

def _prompt_split_image(
image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_image_token
):
@@ -140,9 +147,7 @@ class SmolVLMProcessor(ProcessorMixin):

attributes = ["image_processor", "tokenizer", "video_processor"]
image_processor_class = "SmolVLMImageProcessor"
video_processor_class = (
"SmolVLMImageProcessor" # TODO: raushan should be VideoProcessor when LANCZOS resizing is settled
)
video_processor_class = "SmolVLMVideoProcessor" # NOTE: uses different interpolation than slow processors
tokenizer_class = "AutoTokenizer"

def __init__(
@@ -160,17 +165,7 @@ class SmolVLMProcessor(ProcessorMixin):
self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "<end_of_utterance>")
self.global_image_token = getattr(tokenizer, "global_image_token", "<global-img>")
self.image_seq_len = image_seq_len

self.video_size = video_processor.video_sampling["video_size"]
self.image_size = image_processor.size

self.do_image_splitting = image_processor.do_image_splitting
self.do_video_splitting = video_processor.video_sampling.get("do_image_splitting", False)

self.default_max_frames = video_processor.video_sampling["max_frames"]
self.default_fps = video_processor.video_sampling["fps"]
# Matches one or more occurrences of <row_x_col_y> tags (where x and y are digits, optionally surrounded by newline characters
# self._regex_to_remove_extra_special_tokens = re.compile(r"(<row_\d+_col_\d+>\n?)+")
self.video_token = getattr(tokenizer, "video_token", "<video>")

if not num2words:
raise ImportError(
@@ -179,16 +174,12 @@ class SmolVLMProcessor(ProcessorMixin):

super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template, **kwargs)

def process_vision(
self, text, images, output_kwargs, do_image_splitting=False, image_processor_size=None, processor=None
):
def process_vision(self, text, images, output_kwargs):
if text is not None:
n_images_in_text = [sample.count(self.image_token) for sample in text]

n_images_in_images = [len(sublist) for sublist in images]
image_inputs = processor(
images, do_image_splitting=do_image_splitting, size=image_processor_size, **output_kwargs
)
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])

if text is None:
return None, image_inputs
@@ -227,6 +218,50 @@ class SmolVLMProcessor(ProcessorMixin):

return prompt_strings, image_inputs

def process_video(self, text, videos, output_kwargs):
if text is not None:
n_videos_in_text = [sample.count(self.video_token) for sample in text]

n_videos_in_videos = [len(sublist) for sublist in videos]
video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])

num_frames = video_inputs["pixel_values"].shape[1]
batch_timestamps = iter(video_inputs.pop("timestamps"))
batch_durations = iter(video_inputs.pop("durations"))

if text is None:
return None, video_inputs

if n_videos_in_videos != n_videos_in_text:
raise ValueError(
f"The number of videos in the text {n_videos_in_text} and videos {n_videos_in_videos} should be the same."
)

prompt_strings = []
for sample in text:
while self.video_token in sample:
timestamps = next(batch_timestamps)
duration = next(batch_durations)
duration_td = timedelta(seconds=int(duration))
image_prompt_strings = DEFAULT_VIDEO_INTRO.format(
frame_count=num2words(num_frames), video_duration=str(duration_td)
)
for timestamp in timestamps:
image_prompt_string = _prompt_single_image(
self.image_seq_len,
image_token=self.image_token,
fake_token_around_image=self.fake_image_token,
global_image_token=self.global_image_token,
)
timestamp = f"{timestamp[0]:02d}:{timestamp[1]:02d}"
image_prompt_string = FRAME_TIMESTAMP_MESSAGE.format(timestamp=timestamp) + image_prompt_string
image_prompt_strings += image_prompt_string

image_prompt_strings += DEFAULT_MEDIA_OUTTRO
sample = sample.replace(self.video_token, image_prompt_strings, 1)
prompt_strings.append(sample)
return prompt_strings, video_inputs
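
Illustrative sketch (not part of the commit) of what `process_video` above produces for a single `<video>` placeholder: an intro line, one timestamped image block per sampled frame, and an outro. The real strings come from `DEFAULT_VIDEO_INTRO`, `FRAME_TIMESTAMP_MESSAGE` and `DEFAULT_MEDIA_OUTTRO` in `video_processing_smolvlm`; the literals below are placeholders, not the actual constants.

# Illustrative sketch, not part of the commit; all string literals are placeholders.
timestamps = [[0, 0], [0, 2], [0, 4]]        # [mm, ss] pairs returned by the video processor
duration = 6                                 # seconds, also returned by the video processor

intro = f"[VIDEO INTRO: {len(timestamps)} frames, {duration} s]"
frame_blocks = "".join(
    f"\nFrame from {mm:02d}:{ss:02d}: <image tokens>" for mm, ss in timestamps
)
expanded = intro + frame_blocks + "\n[MEDIA OUTRO]"
print(expanded)   # this is what replaces one "<video>" token in the prompt
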
def __call__(
self,
images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None,
@@ -310,21 +345,14 @@ class SmolVLMProcessor(ProcessorMixin):
text, vision_inputs = self.process_vision(
text,
images,
output_kwargs["images_kwargs"],
do_image_splitting=self.do_image_splitting,
image_processor_size=self.image_size,
processor=self.image_processor,
output_kwargs,
)
inputs.update(vision_inputs)
elif videos is not None:
videos = make_batched_videos(videos)
text, vision_inputs = self.process_vision(
text, vision_inputs = self.process_video(
text,
videos,
output_kwargs["videos_kwargs"],
do_image_splitting=self.do_image_splitting,
image_processor_size=self.video_size,
processor=self.video_processor,
output_kwargs,
)
inputs.update(vision_inputs)

@@ -337,93 +365,6 @@ class SmolVLMProcessor(ProcessorMixin):

return BatchFeature(inputs, tensor_type=return_tensors)

def _process_messages_for_chat_template(
self,
conversations: List[List[Dict[str, str]]],
batch_images: List[ImageInput],
batch_videos: List[VideoInput],
batch_video_metadata: List[List[Dict[str, any]]],
**chat_template_kwargs,
):
"""
Used within `apply_chat_template` when a model has special way to process conversation history. For example,
video models might want to specify in the prompt the duration of video or which frame indices at which timestamps
were sampled. This information cannot be accessed before the video is loaded.
For most models it is a no-op, must be overridden by model processors which require special processing.
Args:
conversation (`List[Dict, str, str]`):
The conversation to process. Always comes in batched format.
batch_images (`List[List[ImageInput]]`):
Batch of images that were loaded from url/path defined in the conversation. The images
are ordered in the same way as in the conversation. Comes in nested list format, one list of `PIL` images
per batch.
batch_videos (`List[List[ImageInput]]`):
Batch of videos that were loaded from url/path defined in the conversation. The videos
are ordered in the same way as in the conversation. Comes in nested list format, one list of 4D video arrays
per batch.
batch_video_metadata (`List[List[Dict[[str, any]]]]`):
Batch of metadata returned from loading videos. That includes video fps, duration and total number of framer in original video.
Metadata are ordered in the same way as `batch_videos`. Comes in nested list format, one list of 4D video arrays
per batch.
"""
# We don't want to modify in-place the messages passed by user
# The user might want to add new turn on conv and continue generation
conversations = copy.deepcopy(conversations)
batch_num_frames, batch_timestamps = [], []
for metadata_list, video_list in zip(batch_video_metadata, batch_videos):
for metadata, video in zip(metadata_list, video_list):
duration_sec = getattr(metadata, "duration")
frames_idx = getattr(metadata, "frames_indices")
fps = getattr(metadata, "fps")

timestamps = []
for idx, frame_np in zip(frames_idx, video):
sec = idx / fps
mm = int(sec // 60)
ss = int(sec % 60)
timestamps.append(f"{mm:02d}:{ss:02d}")
batch_timestamps.append(timestamps)
batch_num_frames.append(len(video))

for conversation in conversations:
# For each message, scan content for {"type": "video"}
for msg in conversation:
if "content" not in msg:
continue

new_content = []
for block in msg["content"]:
if block.get("type") == "video":
curr_timestamps = batch_timestamps.pop(0)
curr_num_frames = batch_num_frames.pop(0)

# Build the video intro texts
td = timedelta(seconds=int(duration_sec))
new_content.append(
{
"type": "text",
"text": DEFAULT_VIDEO_INTRO.format(
frame_count=num2words(curr_num_frames), video_duration=str(td)
),
}
)

# 2) Insert per-frame lines: "Frame from {timestamp}:", then an "image" block
for i, ts in enumerate(curr_timestamps):
new_content.append({"type": "text", "text": FRAME_TIMESTAMP_MESSAGE.format(timestamp=ts)})
new_content.append({"type": "image"})

# 3) Optionally add an outro (e.g. "Now answer the question:")
new_content.append({"type": "text", "text": DEFAULT_MEDIA_OUTTRO})
# Do NOT add the original block => we skip it (since we've replaced it)
else:
# keep original block
new_content.append(block)

# update the content
msg["content"] = new_content
return conversations

def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to SmolVLMTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
@@ -446,45 +387,54 @@ class SmolVLMProcessor(ProcessorMixin):
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(image_processor_input_names + tokenizer_input_names))

# TODO: raushan, has to be public method under `VideoProcessorBase` when API is added
def _load_video_for_model(
def apply_chat_template(
self,
video: Union[str, "VideoInput"],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
backend: str = "opencv",
skip_secs: int = 0.0,
**kwargs,
) -> np.array:
conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
chat_template: Optional[str] = None,
**kwargs: Unpack[AllKwargsForChatTemplate],
) -> str:
"""
Loads `video` to a numpy array.
Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
conversations to turn them into a single tokenizable string.

The input is expected to be in the following format, where each message content is a list consisting of text and
optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
`pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.

conversation = [
{
"role": "user",
"content": [
{"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
{"type": "text", "text": "Please describe this image in detail."},
],
},
]

Args:
video (`str` or `VideoInput`):
The video to convert to the numpy array format. Can be a link to video or local path.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not passed, the whole video is loaded.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
backend (`str`, *optional*, defaults to `"opencv"`):
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".

Returns:
Tuple[`np.array`, Dict]: A tuple containing:
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
- Metadata dictionary.
conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`):
The conversation to format.
chat_template (`Optional[str]`, *optional*):
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
chat template is used.
"""
max_frames = self.default_max_frames if num_frames is None else num_frames
target_fps = self.default_fps if fps is None else fps
if isinstance(conversation, (list, tuple)) and (
isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
):
conversations = conversation
else:
conversations = [conversation]

def sample_indices_fn_func(metadata, **fn_kwargs):
return smolvlm_sample_indices_fn(
metadata, max_frames=max_frames, target_fps=target_fps, skip_secs=skip_secs, **fn_kwargs
)

video, metadata = load_video(video, backend=backend, sample_indices_fn=sample_indices_fn_func)
return video, metadata
has_video = any(
(isinstance(content, dict) and content["type"] == "video")
for conversation in conversations
for message in conversation
for content in message["content"]
)
if chat_template is None and has_video:
# re-assign to the correct default template for BC, if user is not requesting their own template
chat_template = DEFAULT_CHAT_TEMPLATE
return super().apply_chat_template(conversation, chat_template, **kwargs)

__all__ = ["SmolVLMProcessor"]
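
Illustrative usage (not part of the commit): when a conversation contains a video block and no custom template is supplied, the override above falls back to `DEFAULT_CHAT_TEMPLATE`. A minimal sketch; the checkpoint name is a placeholder, and the content keys follow the conversation format documented in the docstring above.

# Illustrative sketch, not part of the commit; checkpoint name is a placeholder.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video", "path": "my_clip.mp4"},
            {"type": "text", "text": "Describe this video."},
        ],
    },
]
# from transformers import AutoProcessor
# processor = AutoProcessor.from_pretrained("<smolvlm2-video-checkpoint>")
# inputs = processor.apply_chat_template(
#     conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
# )
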
@@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Union

import numpy as np

from ...image_processing_utils import (
BatchFeature,
get_size_dict,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
@@ -38,7 +38,7 @@ from ...utils.import_utils import requires
from ...video_processing_utils import (
BaseVideoProcessor,
)
from ...video_utils import group_videos_by_shape, reorder_videos
from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos

if is_vision_available():
@@ -68,66 +68,6 @@ FRAME_TIMESTAMP_MESSAGE = "\nFrame from {timestamp}:"
MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum

def smolvlm_sample_indices_fn(metadata, max_frames, target_fps, skip_secs=0):
"""
Example sampling function which:
- Uses `max_frames` (if provided) or calculates it from `fps` and metadata.
- Applies a basic center-skip if fewer frames than available, otherwise
optionally skips `skip_secs` from both the start and end.
- Uniformly samples the desired number of frames between the start and end indices.

Args:
max_frames (`int`):
Maximum number of frames to sample.
target_fps (`int`):
Target frames to sample per second.
metadata (`dict`):
Contains video metadata such as "n_frames" and "video_fps".
skip_secs (`float`, *optional*, defaults to 1.0):
Number of seconds to skip from the start and end if the video is long enough.

Returns:
numpy.ndarray:
An array of unique frame indices to sample.
"""

total_num_frames = getattr(metadata, "total_num_frames", 0)
if total_num_frames <= 0:
raise ValueError(f"Invalid total_num_frames={total_num_frames} in metadata.")

native_fps = getattr(metadata, "fps", 30.0)
duration_seconds = getattr(metadata, "duration", 0)

if duration_seconds <= 0:
raise ValueError(f"Invalid duration_seconds={duration_seconds} in metadata.")

# Step 1) Estimate how many frames we'd sample at `target_fps`, fallback if target_fps <= 0
estimated_frames = int(round(target_fps * duration_seconds))

# Step 2) desired_frames
desired_frames = min(estimated_frames, max_frames)
if desired_frames < 1:
desired_frames = 1

# Step 3) center skip logic
start_idx = 0
end_idx = total_num_frames - 1

if skip_secs > 0 and (duration_seconds - 2 * skip_secs) > (max_frames * target_fps):
start_idx = int(skip_secs * native_fps)
end_idx = int(total_num_frames - skip_secs * native_fps)

start_idx = max(0, start_idx)
end_idx = min(end_idx, total_num_frames - 1)
if start_idx >= end_idx:
start_idx, end_idx = 0, total_num_frames - 1

indices = np.linspace(start_idx, end_idx, desired_frames, dtype=int)
indices = np.unique(indices)

return indices

def get_max_height_width(videos: list["torch.Tensor"]) -> List[int]:
"""
Get the maximum height and width across all videos in a batch.
@@ -180,13 +120,15 @@ def get_resize_output_image_size(
return height, width

class SmolVLMVideoProcessorInitKwargs(VideosKwargs): ...
class SmolVLMVideoProcessorInitKwargs(VideosKwargs):
max_image_size: dict[str, int] = None

@requires(backends=("torchvision",))
class SmolVLMVideoProcessor(BaseVideoProcessor):
resample = PILImageResampling.LANCZOS
size = {"longest_edge": 4 * 364}
max_image_size = {"longest_edge": 364}
image_mean = IMAGENET_STANDARD_MEAN
image_std = IMAGENET_STANDARD_STD
do_resize = True
@@ -194,11 +136,21 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
do_normalize = True
do_convert_rgb = True
do_pad = True
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = SmolVLMVideoProcessorInitKwargs
model_input_names = ["pixel_values", "pixel_attention_mask"]

def __init__(self, **kwargs: Unpack[SmolVLMVideoProcessorInitKwargs]):
super().__init__(**kwargs)
# For BC pop values from `config.video_sampling`. In official config `video_sampling` is guaranteed to be present
# We check for `Noneness` only for certain tests such as `test_init_without_params`
if "size" in kwargs and "video_sampling" in kwargs:
kwargs["video_sampling"]["video_size"] = kwargs["size"]

if "video_sampling" in kwargs:
self.num_frames = kwargs["video_sampling"]["max_frames"]
self.fps = kwargs["video_sampling"]["fps"]
self.size = get_size_dict(kwargs["video_sampling"]["video_size"], default_to_square=self.default_to_square)

def resize(
self,
@@ -240,12 +192,20 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
new_size = (size.height, size.width)
else:
raise ValueError(f"Size must contain 'height' and 'width' keys, or 'longest_edge' key. Got {size}.")
return F.resize(video, new_size, interpolation=interpolation, antialias=antialias)

video = F.resize(video, new_size, interpolation=interpolation, antialias=antialias)

# Resize again to match image processor when `do_image_splitting=False`. Frames have to be squared to `max_image_size`
# NOTE: videos are always processoed without image splitting
max_size = self.max_image_size["longest_edge"], self.max_image_size["longest_edge"]
video = F.resize(video, max_size, interpolation=interpolation, antialias=antialias)
return video

def pad(
self,
video: "torch.Tensor",
padded_size: tuple[int, int],
max_num_frames: int,
fill: int = 0,
return_pixel_mask: bool = True,
):
@@ -255,24 +215,28 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
Video to pad.
padded_size (`Tuple[int, int]`):
Height and width to pad.
max_num_frames (`int`):
The maximum number of frames to which video will be padded.
fill (`int`, *optional*):
The value to use for the padding.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
Whether to return a pixel mask.
"""
original_size = video.size()[-2:]
padding_bottom = padded_size[0] - original_size[0]
padding_right = padded_size[1] - original_size[1]
if padding_bottom < 0 or padding_right < 0:
padding_height = padded_size[0] - original_size[0]
padding_width = padded_size[1] - original_size[1]
padding_frame = max_num_frames - video.shape[0]
if padding_width < 0 or padding_height < 0:
raise ValueError(
f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
f"original size. Got padded size: {padded_size}, original size: {original_size}."
)
if original_size != padded_size:
padding = [0, 0, padding_right, padding_bottom]
padding = [0, padding_width, 0, padding_height, 0, 0, 0, padding_frame]
video = F.pad(video, padding, fill=fill)

# Make a pixel mask for the video, where 1 indicates a valid pixel and 0 indicates padding.
# Mask shape is (num_frames, height, width) so we omit the channel dim
pixel_mask = None
if return_pixel_mask:
pixel_mask = torch.zeros_like(video[..., 0, :, :], dtype=torch.int64)
@@ -280,9 +244,79 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):

return video, pixel_mask
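
A clarifying sketch (not part of the commit): the new 8-value padding list above follows the (left, right)-pairs-from-the-last-dimension ordering used by `torch.nn.functional.pad`, so for a `(frames, channels, height, width)` clip it pads width, then height, leaves channels alone, and finally pads the frame dimension up to `max_num_frames`. A stand-alone version using `torch.nn.functional.pad` directly (the production code passes the list to its own `F.pad` helper):

# Illustrative sketch, not part of the commit.
import torch
import torch.nn.functional as F

video = torch.zeros(6, 3, 100, 120)                  # 6 frames of 100x120
padded_size, max_num_frames = (128, 128), 8

padding = [
    0, padded_size[1] - video.shape[-1],             # width: pad 8 pixels on the right
    0, padded_size[0] - video.shape[-2],             # height: pad 28 pixels at the bottom
    0, 0,                                            # channels: unchanged
    0, max_num_frames - video.shape[0],              # frames: pad 2 empty frames at the end
]
padded = F.pad(video, padding, value=0)
print(padded.shape)                                  # torch.Size([8, 3, 128, 128])
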
def sample_frames(
self,
video: "torch.Tensor",
metadata: Union[VideoMetadata, dict],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
skip_secs: Optional[int] = 1,
):
"""
Video sampling function which:
- Uses `num_frames` (if provided) or calculates it from `fps` and metadata.
- Applies a basic center-skip if fewer frames than available, otherwise
optionally skips `skip_secs` from both the start and end.
- Uniformly samples the desired number of frames between the start and end indices.

Args:
video (`torch.Tensor`):
Video that need to be sampled.
metadata (`VideoMetadata`):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
Target frames to sample per second. Defaults to `self.fps`.
skip_secs (`float`, *optional*, defaults to `1`):
Number of seconds to skip from the start and end if the video is long enough.

Returns:
torch.Tensor:
Sampled video frames.
"""
num_frames = num_frames if num_frames is not None else self.num_frames
fps = fps if fps is not None else self.fps

total_num_frames = video.shape[0]

# Step 1) Estimate how many frames we'd sample at `target_fps`, fallback if target_fps <= 0
estimated_frames = int(round(fps * metadata["duration"]))

# Step 2) desired_frames
desired_frames = min(estimated_frames, num_frames)
if desired_frames < 1:
desired_frames = 1

# Step 3) center skip logic
start_idx = 0
end_idx = total_num_frames - 1

if skip_secs > 0 and (metadata["duration"] - 2 * skip_secs) > (num_frames * fps):
start_idx = int(skip_secs * metadata["fps"])
end_idx = int(total_num_frames - skip_secs * metadata["fps"])

start_idx = max(0, start_idx)
end_idx = min(end_idx, total_num_frames - 1)
if start_idx >= end_idx:
start_idx, end_idx = 0, total_num_frames - 1

indices = np.linspace(start_idx, end_idx, desired_frames, dtype=int)
indices = np.unique(indices)
video = video[indices].contiguous()

timestamps = []
for idx in indices:
sec = idx / metadata["fps"]
mm = int(sec // 60)
ss = int(sec % 60)
timestamps.append([mm, ss])
return video, timestamps, int(metadata["duration"])
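
Illustrative worked example (not part of the commit) for `sample_frames` above: a 20-second clip at 30 fps (600 frames), sampled with `num_frames=12`, `fps=1`, `skip_secs=1`. The clip is long enough for the center-skip, so one second is dropped at each end before the uniform sampling, and `[mm, ss]` timestamps are returned for the prompt builder in the processor.

# Illustrative sketch, not part of the commit.
import numpy as np

total, metadata = 600, {"fps": 30, "duration": 20.0}
num_frames, fps, skip_secs = 12, 1, 1

desired = max(1, min(int(round(fps * metadata["duration"])), num_frames))          # min(20, 12) -> 12
start_idx, end_idx = 0, total - 1
if skip_secs > 0 and (metadata["duration"] - 2 * skip_secs) > (num_frames * fps):  # 18 > 12 -> skip
    start_idx = int(skip_secs * metadata["fps"])                                   # 30
    end_idx = int(total - skip_secs * metadata["fps"])                             # 570

indices = np.unique(np.linspace(start_idx, end_idx, desired, dtype=int))
timestamps = [[int(i / metadata["fps"] // 60), int(i / metadata["fps"] % 60)] for i in indices]
print(indices.tolist())                  # [30, 79, 128, ..., 570]
print(timestamps[:3], timestamps[-1])    # [[0, 1], [0, 2], [0, 4]] [0, 19]
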
def _preprocess(
self,
videos: List["torch.Tensor"],
video_metadata: Union[List[VideoMetadata], List[dict]],
do_convert_rgb: bool,
do_resize: bool,
size: SizeDict,
@@ -291,13 +325,38 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
rescale_factor: float,
do_normalize: bool,
do_pad: bool,
do_sample_frames: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
fps: Optional[int] = None,
num_frames: Optional[int] = None,
skip_secs: Optional[int] = 0,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
):
# Group videos by size for batched resizing
grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
if do_sample_frames:
if video_metadata[0] is None:
raise ValueError(
"Frame sampling is enabled but no video metadata was found. SmolVLM requires metadata to correctly sample frames. "
"Please pass in `VideoMetadata` object per each input video or set `do_sample_frames=False`"
)
processed_videos = []
timestamps_list, durations_list = [], []
for video, metadata in zip(videos, video_metadata):
video, timestamps, duration = self.sample_frames(video, metadata, num_frames, fps, skip_secs)
timestamps_list.append(timestamps)
durations_list.append(duration)
processed_videos.append(video)
else:
# Assume 24 fps by default and prepare timestamps for the whole video when all frames are sampled
processed_videos = videos
timestamps_list = [
[(int((idx / 24) // 60), int((idx / 24) % 60)) for idx in range(len(video))] for video in videos
]
durations_list = [len(video) // 24 for video in videos]

grouped_videos, grouped_videos_index = group_videos_by_shape(processed_videos)
resized_videos_grouped = {}
for shape, stacked_videos in grouped_videos.items():
if do_convert_rgb:
@@ -319,12 +378,15 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):

if do_pad:
pad_size = get_max_height_width(processed_videos)
max_num_frames = max(len(video) for video in processed_videos)
grouped_videos, grouped_videos_index = group_videos_by_shape(processed_videos)
processed_padded_mask_grouped = {}
processed_videos_grouped = {}

for shape, stacked_videos in grouped_videos.items():
stacked_videos, padded_masks = self.pad(stacked_videos, padded_size=pad_size)
stacked_videos, padded_masks = self.pad(
stacked_videos, padded_size=pad_size, max_num_frames=max_num_frames
)
processed_videos_grouped[shape] = stacked_videos
processed_padded_mask_grouped[shape] = padded_masks

@@ -332,7 +394,7 @@ class SmolVLMVideoProcessor(BaseVideoProcessor):
pixel_attention_mask = reorder_videos(processed_padded_mask_grouped, grouped_videos_index)

processed_videos = torch.stack(processed_videos, dim=0) if return_tensors else processed_videos
data = {"pixel_values": processed_videos}
data = {"pixel_values": processed_videos, "timestamps": timestamps_list, "durations": durations_list}

if do_pad:
data["pixel_attention_mask"] = (

@@ -46,6 +46,7 @@ class VideoLlavaVideoProcessor(BaseVideoProcessor):
do_rescale = True
do_normalize = True
do_convert_rgb = True
do_sample_frames = False # Set to False for BC, recommended to set `True` in new models
valid_kwargs = VideoLlavaFastVideoProcessorInitKwargs
model_input_names = ["pixel_values_videos"]

@@ -24,7 +24,7 @@ import typing
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, TypedDict, Union
from typing import Any, Dict, Optional, TypedDict, Union

import numpy as np
import typing_extensions
@@ -33,9 +33,9 @@ from huggingface_hub.errors import EntryNotFoundError
from .audio_utils import load_audio
from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature
from .image_utils import ChannelDimension, ImageInput, is_valid_image, is_vision_available, load_image
from .image_utils import ChannelDimension, is_valid_image, is_vision_available, load_image
from .utils.chat_template_utils import render_jinja_template
from .video_utils import VideoInput, load_video
from .video_utils import VideoMetadata, load_video

if is_vision_available():
@@ -64,6 +64,7 @@ from .utils import (
list_repo_templates,
logging,
)
from .utils.deprecation import deprecate_kwarg

logger = logging.get_logger(__name__)
@@ -235,6 +236,14 @@ class VideosKwargs(TypedDict, total=False):
Whether to pad the video to the `(max_height, max_width)` of the videos in the batch.
do_center_crop (`bool`, *optional*):
Whether to center crop the video.
do_sample_frames (`bool`, *optional*):
Whether to sample frames from the video before processing or to process the whole video.
video_metadata (`VideoMetadata`, *optional*):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample when `do_sample_frames=True`.
fps (`int`, *optional*):
Target frames to sample per second when `do_sample_frames=True`.
crop_size (`Dict[str, int]`, *optional*):
Desired output size when applying center-cropping.
data_format (`ChannelDimension` or `str`, *optional*):
@@ -260,6 +269,10 @@ class VideosKwargs(TypedDict, total=False):
data_format: Optional[ChannelDimension]
input_data_format: Optional[Union[str, ChannelDimension]]
device: Optional[str]
do_sample_frames: Optional[bool]
video_metadata: Optional[Union[VideoMetadata, dict]]
fps: Optional[int]
num_frames: Optional[int]

class AudioKwargs(TypedDict, total=False):
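
Illustrative usage (not part of the commit): the new `VideosKwargs` fields are regular call-time kwargs for the `BaseVideoProcessor` subclasses touched in this diff. A minimal sketch; `video_processor` and `clip` are placeholders:

# Illustrative sketch, not part of the commit; `video_processor` and `clip` are placeholders.
videos_kwargs = {
    "do_sample_frames": True,                # opt in to in-processor frame sampling
    "num_frames": 16,                        # or "fps": 2 -- mutually exclusive in the samplers above
    "video_metadata": [{"fps": 30, "duration": 12.0, "total_num_frames": 360}],
}
# out = video_processor(videos=[clip], **videos_kwargs, return_tensors="pt")
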
@@ -409,9 +422,6 @@ class ChatTemplateLoadKwargs(TypedDict, total=False):
The backend to use when loading the video which will be used only when there are videos in the conversation.
Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav" because it is the only backend
that supports all types of sources to load from.
video_fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
sample_indices_fn (`Callable`, *optional*):
A callable function that will return indices at which the video should be sampled. If the video has to be
sampled with a different technique than the one provided by the `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
@@ -424,9 +434,7 @@ class ChatTemplateLoadKwargs(TypedDict, total=False):
return np.linspace(start_idx, end_idx, num_frames, dtype=int)
"""

num_frames: Optional[int] = None
video_load_backend: Optional[str] = "pyav"
video_fps: Optional[int] = None
sampling_rate: Optional[int] = 16_000
load_audio_from_video: Optional[bool] = False
@@ -1371,40 +1379,7 @@ class ProcessorMixin(PushToHubMixin):
)
return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)}

def _process_messages_for_chat_template(
self,
conversation: List[List[Dict[str, str]]],
batch_images: List[ImageInput],
batch_videos: List[VideoInput],
batch_video_metadata: List[List[Dict[str, any]]],
**mm_load_kwargs: Unpack[ChatTemplateLoadKwargs],
):
"""
Used within `apply_chat_template` when a model has a special way to process conversation history. For example,
video models might want to specify in the prompt the duration of the video or which frame indices were sampled
at which timestamps. This information cannot be accessed before the video is loaded.

For most models it is a no-op, and must be overridden by model processors which require special processing.

Args:
conversation (`List[List[Dict[str, str]]]`):
The conversation to process. Always comes in batched format.
batch_images (`List[List[ImageInput]]`):
Batch of images that were loaded from the url/path defined in the conversation. The images
are ordered in the same way as in the conversation. Comes in nested list format, one list of `PIL` images
per batch.
batch_videos (`List[List[VideoInput]]`):
Batch of videos that were loaded from the url/path defined in the conversation. The videos
are ordered in the same way as in the conversation. Comes in nested list format, one list of 4D video arrays
per batch.
batch_video_metadata (`List[List[Dict[str, any]]]`):
Batch of metadata returned from loading videos. That includes video fps, duration and total number of frames
in the original video. Metadata are ordered in the same way as `batch_videos`. Comes in nested list format,
one list of metadata dicts per batch.
"""
return conversation

@deprecate_kwarg("video_fps", version="4.58", new_name="fps")
def apply_chat_template(
self,
conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
@@ -1423,7 +1398,7 @@ class ProcessorMixin(PushToHubMixin):
{
"role": "user",
"content": [
{"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
{"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
{"type": "text", "text": "Please describe this image in detail."},
],
},
@@ -1436,7 +1411,6 @@ class ProcessorMixin(PushToHubMixin):
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
chat template is used.
"""

if chat_template is None:
if isinstance(self.chat_template, dict) and "default" in self.chat_template:
chat_template = self.chat_template["default"]
@@ -1545,16 +1519,12 @@ class ProcessorMixin(PushToHubMixin):
metadata = None
logger.warning(
"When loading the video from a list of images, we cannot infer metadata such as `fps` or `duration`. "
"If your model uses this metadata during processing, please load the whole video and let the model sample frames instead."
"If your model requires metadata during processing, please load the whole video and let the processor sample frames instead."
)
else:
# TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added
video, metadata = self._load_video_for_model(
video, metadata = load_video(
fname,
num_frames=mm_load_kwargs.get("num_frames", None),
fps=mm_load_kwargs.get("video_fps", None),
backend=mm_load_kwargs["video_load_backend"],
**kwargs,
)
videos.append(video)
video_metadata.append(metadata)
@@ -1567,15 +1537,6 @@ class ProcessorMixin(PushToHubMixin):
batch_videos.append(videos)
batch_video_metadata.append(video_metadata)

# Process conversation with video/image information if needed. Then convert into a prompt using Jinja template
conversations = self._process_messages_for_chat_template(
conversations,
batch_images=batch_images,
batch_videos=batch_videos,
batch_video_metadata=batch_video_metadata,
**processed_kwargs["mm_load_kwargs"],
)

prompt, generation_indices = render_jinja_template(
conversations=conversations,
chat_template=chat_template,
@@ -1597,11 +1558,17 @@ class ProcessorMixin(PushToHubMixin):
if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
kwargs["add_special_tokens"] = False

# Always sample frames by default unless explicitly set to `False` by users. If users do not pass
# `num_frames`/`video_fps`, sampling should not be done, for BC.
if "do_sample_frames" not in kwargs and ("fps" in kwargs or "num_frames" in kwargs):
kwargs["do_sample_frames"] = True

out = self(
text=prompt,
images=batch_images if batch_images else None,
videos=batch_videos if batch_videos else None,
audio=batch_audios if batch_audios else None,
video_metadata=batch_video_metadata,
**kwargs,
)
if return_dict:
@@ -1626,38 +1593,6 @@ class ProcessorMixin(PushToHubMixin):
return out["input_ids"]
return prompt
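The effect of the default above, sketched with a placeholder checkpoint and video path (both assumptions): passing `fps` or `num_frames` now turns sampling on unless `do_sample_frames=False` is given explicitly.

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("some/video-llm-checkpoint")  # placeholder
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "path": "sample_demo_1.mp4"},  # placeholder path
            {"type": "text", "text": "What is shown in this video?"},
        ],
    }
]

# `fps=1` implies do_sample_frames=True, so the video processor decides which frames to keep;
# add do_sample_frames=False to load and process the whole clip instead.
out = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    fps=1,
)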
|
||||
# TODO: raushan, has to be public method under `VideoProcessorBase` when API is added
|
||||
# Keep private so we can simply remove when needed
|
||||
def _load_video_for_model(
|
||||
self,
|
||||
video: Union[str, "VideoInput"],
|
||||
num_frames: Optional[int] = None,
|
||||
fps: Optional[int] = None,
|
||||
backend: str = "opencv",
|
||||
**kwargs,
|
||||
) -> np.array:
|
||||
"""
|
||||
Loads `video` to a numpy array.
|
||||
|
||||
Args:
|
||||
video (`str` or `VideoInput`):
|
||||
The video to convert to the numpy array format. Can be a link to video or local path.
|
||||
num_frames (`int`, *optional*):
|
||||
Number of frames to sample uniformly. If not passed, the whole video is loaded.
|
||||
fps (`int`, *optional*):
|
||||
Number of frames to sample per second. Should be passed only when `num_frames=None`.
|
||||
If not specified and `num_frames==None`, all frames are sampled.
|
||||
backend (`str`, *optional*, defaults to `"opencv"`):
|
||||
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".
|
||||
|
||||
Returns:
|
||||
Tuple[`np.array`, Dict]: A tuple containing:
|
||||
- Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
|
||||
- Metadata dictionary.
|
||||
"""
|
||||
video, metadata = load_video(video, num_frames, fps=fps, backend=backend)
|
||||
return video, metadata
|
||||
|
||||
def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs):
|
||||
"""
|
||||
Post-process the output of a vlm to decode the text.
|
||||
|
@@ -51,6 +51,7 @@ from .utils import (
from .utils.import_utils import requires
from .video_utils import (
VideoInput,
VideoMetadata,
group_videos_by_shape,
load_video,
make_batched_videos,
@@ -118,6 +119,14 @@ BASE_VIDEO_PROCESSOR_DOCSTRING = r"""
Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the video to RGB.
video_metadata (`VideoMetadata`, *optional*):
Metadata of the video containing information about total duration, fps and total number of frames.
do_sample_frames (`bool`, *optional*, defaults to `self.do_sample_frames`):
Whether to sample frames from the video before processing or to process the whole video.
num_frames (`int`, *optional*, defaults to `self.num_frames`):
Maximum number of frames to sample when `do_sample_frames=True`.
fps (`int`, *optional*, defaults to `self.fps`):
Target number of frames to sample per second when `do_sample_frames=True`.
return_tensors (`str` or `TensorType`, *optional*):
Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
@@ -157,6 +166,10 @@ class BaseVideoProcessor(BaseImageProcessorFast):
rescale_factor = 1 / 255
do_normalize = None
do_convert_rgb = None
do_sample_frames = None
fps = None
num_frames = None
video_metadata = None
valid_kwargs = VideosKwargs
model_input_names = ["pixel_values_videos"]
@@ -219,9 +232,67 @@ class BaseVideoProcessor(BaseImageProcessorFast):
video = (1 - alpha[..., None, :, :]) * 255 + alpha[..., None, :, :] * video[..., :3, :, :]
return video

def sample_frames(
self,
video: "torch.Tensor",
metadata: Optional[Union[VideoMetadata, dict]] = None,
num_frames: Optional[int] = None,
fps: Optional[int] = None,
):
"""
Default sampling function which uniformly samples the desired number of frames between 0 and the total number of frames.
If `fps` is passed along with metadata, `fps` frames per second are sampled uniformly. Arguments `num_frames`
and `fps` are mutually exclusive.

Args:
video (`torch.Tensor`):
Video that needs to be sampled.
metadata (`VideoMetadata`, *optional*):
Metadata of the video containing information about total duration, fps and total number of frames.
num_frames (`int`, *optional*):
Maximum number of frames to sample. Defaults to `self.num_frames`.
fps (`int`, *optional*):
Target number of frames to sample per second. Defaults to `self.fps`.

Returns:
torch.Tensor:
Sampled video frames.
"""
if fps is not None and num_frames is not None:
raise ValueError(
"`num_frames` and `fps` are mutually exclusive arguments, please use only one!"
)

num_frames = num_frames if num_frames is not None else self.num_frames
fps = fps if fps is not None else self.fps
total_num_frames = video.shape[0]

# If num_frames is not given but fps is, calculate num_frames from fps
if num_frames is None and fps is not None:
if metadata is None:
raise ValueError(
"Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
"Please pass in a `VideoMetadata` object or use a fixed `num_frames` per input video"
)
num_frames = int(total_num_frames / metadata["fps"] * fps)

if num_frames > total_num_frames:
raise ValueError(
f"Video can't be sampled. The `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
)

if num_frames is not None:
indices = torch.arange(0, total_num_frames, total_num_frames / num_frames).int()
else:
indices = torch.arange(0, total_num_frames).int()

video = video[indices].contiguous()
return video
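The sampling rule above can be checked in isolation; this standalone sketch (the helper name is mine, not from the commit) mirrors the index computation and the `fps`-to-`num_frames` conversion:

import torch

def uniform_frame_indices(total_num_frames, num_frames=None, fps=None, metadata_fps=None):
    # Mirrors sample_frames: derive num_frames from fps when only fps is given.
    if num_frames is None and fps is not None:
        # e.g. 8 frames recorded at 4 fps, resampled at 3 fps -> int(8 / 4 * 3) = 6 frames
        num_frames = int(total_num_frames / metadata_fps * fps)
    if num_frames is None:
        return torch.arange(0, total_num_frames).int()
    return torch.arange(0, total_num_frames, total_num_frames / num_frames).int()

print(uniform_frame_indices(8, num_frames=4))           # tensor([0, 2, 4, 6], dtype=torch.int32)
print(uniform_frame_indices(8, fps=3, metadata_fps=4))  # 6 evenly spaced indices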
def _prepare_input_videos(
self,
videos: VideoInput,
video_metadata: VideoMetadata = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
device: Optional["torch.device"] = None,
) -> List["torch.Tensor"]:
@@ -229,6 +300,11 @@ class BaseVideoProcessor(BaseImageProcessorFast):
Prepare the input videos for processing.
"""
videos = make_batched_videos(videos)
if video_metadata is not None:
batch_metadata = [metadata for batch_list in video_metadata for metadata in batch_list]
else:
batch_metadata = [None] * len(videos)

processed_videos = []
for video in videos:
# `make_batched_videos` always returns a 4D array per video
@@ -242,7 +318,7 @@ class BaseVideoProcessor(BaseImageProcessorFast):
video = video.to(device)

processed_videos.append(video)
return processed_videos
return processed_videos, batch_metadata
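For clarity, the nesting expected here mirrors the videos themselves: one inner list of metadata dicts per sample, flattened in the same order. A small sketch, not taken from the commit:

import numpy as np

def clip(n):
    return np.random.randint(0, 256, size=(n, 64, 64, 3), dtype=np.uint8)

videos = [[clip(64), clip(32)], [clip(48)]]  # two samples: two clips, then one clip
video_metadata = [
    [{"total_num_frames": 64, "fps": 8, "duration": 8.0}, {"total_num_frames": 32, "fps": 8, "duration": 4.0}],
    [{"total_num_frames": 48, "fps": 24, "duration": 2.0}],
]

# Flattened the same way as the videos, so metadata i always describes flattened video i.
flat_metadata = [m for batch_list in video_metadata for m in batch_list]
assert len(flat_metadata) == 3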
@add_start_docstrings(BASE_VIDEO_PROCESSOR_DOCSTRING)
def preprocess(
@@ -261,7 +337,10 @@ class BaseVideoProcessor(BaseImageProcessorFast):

input_data_format = kwargs.pop("input_data_format")
device = kwargs.pop("device")
videos = self._prepare_input_videos(videos=videos, input_data_format=input_data_format, device=device)
video_metadata = kwargs.pop("video_metadata")
videos, video_metadata = self._prepare_input_videos(
videos=videos, video_metadata=video_metadata, input_data_format=input_data_format, device=device
)

kwargs = self._further_process_kwargs(**kwargs)
self._validate_preprocess_kwargs(**kwargs)
@@ -276,11 +355,12 @@ class BaseVideoProcessor(BaseImageProcessorFast):
kwargs.pop("default_to_square")
kwargs.pop("data_format")

return self._preprocess(videos=videos, **kwargs)
return self._preprocess(videos=videos, video_metadata=video_metadata, **kwargs)

def _preprocess(
self,
videos: List["torch.Tensor"],
video_metadata: Union[List[VideoMetadata], List[dict]],
do_convert_rgb: bool,
do_resize: bool,
size: SizeDict,
@@ -294,8 +374,18 @@ class BaseVideoProcessor(BaseImageProcessorFast):
do_normalize: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
do_sample_frames: Optional[bool] = None,
fps: Optional[int] = None,
num_frames: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
if do_sample_frames:
# Sample video frames
videos = [
self.sample_frames(video, metadata=metadata, num_frames=num_frames, fps=fps)
for video, metadata in zip(videos, video_metadata)
]

# Group videos by size for batched resizing
grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
resized_videos_grouped = {}
@@ -74,6 +74,9 @@ class VideoMetadata:
duration: float
video_backend: str

def __getitem__(self, item):
return getattr(self, item)


def is_valid_video_frame(frame):
return isinstance(frame, PIL.Image.Image) or (
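The `__getitem__` added above lets a `VideoMetadata` instance be indexed like the plain dicts that callers may pass instead. Field names below follow the dataclass as shown in this diff; treat the constructor call as a sketch rather than the canonical signature:

from transformers.video_utils import VideoMetadata

meta = VideoMetadata(total_num_frames=120, fps=30.0, duration=4.0, video_backend="opencv")

# Dict-style and attribute-style access are now interchangeable downstream,
# which is what lets sample_frames accept either a VideoMetadata or a raw dict.
assert meta["fps"] == meta.fps == 30.0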
@@ -163,7 +166,7 @@ def make_batched_videos(videos) -> List[Union["np.ndarray", "torch.Tensor"]]:
videos = [np.array(videos)[None, ...]]
# nested batch so we need to unflatten
elif isinstance(videos[0], (list, tuple)) and is_valid_video(videos[0][0]):
return [video for sublist in videos for video in sublist]
videos = [video for sublist in videos for video in sublist]
return convert_pil_frames_to_video(videos)
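The fix above keeps flattening nested batches but still routes the result through `convert_pil_frames_to_video` instead of returning early. A short sketch of the accepted nesting, assuming numpy clips pass through unchanged:

import numpy as np
from transformers.video_utils import make_batched_videos

video_a = np.zeros((8, 64, 64, 3), dtype=np.uint8)   # (num_frames, height, width, channels)
video_b = np.zeros((4, 64, 64, 3), dtype=np.uint8)

# A nested batch (one list of clips per conversation) is unflattened into a flat list of 4D videos.
batched = make_batched_videos([[video_a], [video_b]])
print(len(batched))  # 2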
@ -17,7 +17,6 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AutoProcessor, AutoTokenizer, InternVLProcessor
|
||||
@ -180,77 +179,6 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
images_patches_index += inputs["pixel_values"].shape[0]
|
||||
|
||||
# Override video chat_template tests as InternVLProcessor returns flattened video features
|
||||
@require_av
|
||||
@require_torch
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
num_frames=8,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
# Difference with common tests, InternVLProcessor returns flattened video features, and uses 8 frames by default
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 8)
|
||||
|
||||
@require_torch
|
||||
@require_av
|
||||
def test_apply_chat_template_video_frame_sampling(self):
|
||||
@ -393,13 +321,13 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 4 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
|
||||
video_len = 2 if batch_size == 1 else 3 # InternVL patches out and removes frames after processing
|
||||
self.assertEqual(len(out_dict[self.videos_input_name]), video_len)
|
||||
for k in out_dict:
|
||||
self.assertIsInstance(out_dict[k], torch.Tensor)
|
||||
|
@ -407,14 +407,14 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 5760 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models
|
||||
video_len = 2880 if batch_size == 1 else 5808 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 1564 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
@ -525,72 +525,6 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 145912)
|
||||
|
||||
@require_librosa
|
||||
@require_av
|
||||
@unittest.skip(
|
||||
|
@ -19,7 +19,6 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
@ -219,14 +218,14 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 192 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
@ -346,70 +345,3 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
|
||||
|
@ -19,7 +19,6 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from transformers import AutoProcessor, Qwen2Tokenizer
|
||||
from transformers.testing_utils import require_av, require_torch, require_vision
|
||||
@ -220,14 +219,14 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
self.assertEqual(len(out_dict["input_ids"]), batch_size)
|
||||
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
|
||||
|
||||
video_len = 360 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
video_len = 180 if batch_size == 1 else 320 # qwen pixels don't scale with bs same way as other models
|
||||
mm_len = batch_size * 192 if modality == "image" else video_len
|
||||
self.assertEqual(len(out_dict[input_name]), mm_len)
|
||||
|
||||
@ -337,73 +336,6 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 160)
|
||||
|
||||
@require_av
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 21960)
|
||||
|
||||
def test_kwargs_overrides_custom_image_processor_kwargs(self):
|
||||
processor = self.get_processor()
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
@ -99,8 +99,9 @@ class Qwen2VLVideoProcessingTester:
|
||||
}
|
||||
|
||||
@require_vision
|
||||
def expected_output_video_shape(self, videos):
|
||||
grid_t = self.num_frames // self.temporal_patch_size
|
||||
def expected_output_video_shape(self, videos, num_frames=None):
|
||||
num_frames = num_frames if num_frames is not None else self.num_frames
|
||||
grid_t = num_frames // self.temporal_patch_size
|
||||
hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
|
||||
seq_len = 0
|
||||
for video in videos:
|
||||
@ -289,3 +290,70 @@ class Qwen2VLVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
|
||||
def test_call_sample_frames(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
|
||||
prev_num_frames = self.video_processor_tester.num_frames
|
||||
self.video_processor_tester.num_frames = 8
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False,
|
||||
return_tensors="torch",
|
||||
)
|
||||
|
||||
# Force set sampling to False. No sampling is expected even when `num_frames` exists
|
||||
video_processing.do_sample_frames = False
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
|
||||
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(video_inputs)
|
||||
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
|
||||
|
||||
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
|
||||
video_processing.do_sample_frames = True
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=4)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=4)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
|
||||
[video_inputs[0]], num_frames=4
|
||||
)
|
||||
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
|
||||
video_inputs, num_frames=4
|
||||
)
|
||||
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
|
||||
|
||||
# Sample with `fps` requires metadata to infer number of frames from total duration
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=3)[self.input_name]
|
||||
|
||||
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
|
||||
batched_metadata = metadata * len(video_inputs)
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
|
||||
self.input_name
|
||||
]
|
||||
encoded_videos_batched = video_processing(
|
||||
video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
|
||||
)[self.input_name]
|
||||
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
|
||||
[video_inputs[0]], num_frames=6
|
||||
)
|
||||
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
|
||||
video_inputs, num_frames=6
|
||||
)
|
||||
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
|
||||
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
|
||||
|
||||
# We should raise error when asked to sample more frames than there are in input video
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
|
||||
self.input_name
|
||||
]
|
||||
|
||||
# Assign back the actual num frames in tester
|
||||
self.video_processor_tester.num_frames = prev_num_frames
|
||||
|
@ -16,6 +16,7 @@ import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import BytesIO
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
@ -63,7 +64,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
cls.bos_token = processor.tokenizer.bos_token
|
||||
cls.image_token = processor.image_token
|
||||
cls.video_token = processor.image_token * 8 # SmolVLM uses image token and repeats it `num_frames` times
|
||||
cls.video_token = processor.video_token
|
||||
cls.fake_image_token = processor.fake_image_token
|
||||
cls.global_img_token = processor.global_image_token
|
||||
|
||||
@ -93,6 +94,13 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
"chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
|
||||
}
|
||||
|
||||
def prepare_video_inputs(self, batch_size: Optional[int] = None):
|
||||
"""This function prepares a list of numpy videos."""
|
||||
video_input = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] * 8
|
||||
if batch_size is None:
|
||||
return [[video_input]]
|
||||
return [[video_input]] * batch_size
|
||||
|
||||
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
|
||||
text_split_images = []
|
||||
for n_h in range(image_rows):
|
||||
@ -347,7 +355,6 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
{"type": "text", "text": "What do these images show?"},
|
||||
{"type": "image"},
|
||||
{"type": "image"},
|
||||
"What do these images show?",
|
||||
],
|
||||
},
|
||||
{
|
||||
@ -373,11 +380,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
)
|
||||
self.assertEqual(rendered, expected_rendered)
|
||||
|
||||
@unittest.skip(reason="SmolVLM replaced `type=video` with `type=image` in chat templates")
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
pass
|
||||
|
||||
@require_av
|
||||
@require_torch
|
||||
def test_apply_chat_template_video_frame_sampling(self):
|
||||
# overridden because SmolVLM has special preprocessing for videos
|
||||
processor = self.get_processor()
|
||||
@ -406,7 +410,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
num_frames=num_frames,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
@ -421,7 +425,7 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
video_fps=video_fps,
|
||||
return_tensors="np",
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
@ -482,11 +486,11 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
do_rescale=True,
|
||||
rescale_factor=-1,
|
||||
padding="max_length",
|
||||
max_length=76,
|
||||
max_length=172,
|
||||
)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 172)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
|
@ -15,22 +15,16 @@
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
|
||||
from transformers.utils import is_torchvision_available, is_vision_available
|
||||
|
||||
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
if is_torchvision_available():
|
||||
from transformers import SmolVLMVideoProcessor
|
||||
from transformers.models.smolvlm.video_processing_smolvlm import get_resize_output_image_size
|
||||
|
||||
|
||||
class SmolVLMVideoProcessingTester:
|
||||
@ -58,6 +52,7 @@ class SmolVLMVideoProcessingTester:
|
||||
self.max_resolution = max_resolution
|
||||
self.do_resize = do_resize
|
||||
self.size = size
|
||||
self.max_image_size = size
|
||||
self.do_normalize = do_normalize
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
@ -71,17 +66,16 @@ class SmolVLMVideoProcessingTester:
|
||||
"image_mean": self.image_mean,
|
||||
"image_std": self.image_std,
|
||||
"do_convert_rgb": self.do_convert_rgb,
|
||||
"max_image_size": self.max_image_size,
|
||||
}
|
||||
|
||||
def expected_output_video_shape(self, videos):
|
||||
max_height, max_width = 0, 0
|
||||
if not isinstance(videos[0], torch.Tensor):
|
||||
videos = [torch.tensor(np.array(video)).permute(0, -1, -3, -2) for video in videos]
|
||||
for video in videos:
|
||||
height, width = get_resize_output_image_size(video, self.size["longest_edge"])
|
||||
max_height = max(height, max_height)
|
||||
max_width = max(width, max_width)
|
||||
return [self.num_frames, self.num_channels, max_height, max_width]
|
||||
return [
|
||||
self.num_frames,
|
||||
self.num_channels,
|
||||
self.max_image_size["longest_edge"],
|
||||
self.max_image_size["longest_edge"],
|
||||
]
|
||||
|
||||
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
|
||||
videos = prepare_video_inputs(
|
||||
@ -116,3 +110,58 @@ class SmolVLMVideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
|
||||
|
||||
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict, size=42)
|
||||
self.assertEqual(video_processor.size, {"height": 42, "width": 42})
|
||||
|
||||
# overwrite, SmolVLM requires metadata no matter how we sample
|
||||
def test_call_sample_frames(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
|
||||
prev_num_frames = self.video_processor_tester.num_frames
|
||||
self.video_processor_tester.num_frames = 8
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False,
|
||||
return_tensors="torch",
|
||||
)
|
||||
|
||||
# Force set sampling to False. No sampling is expected even when `num_frames` exists
|
||||
video_processing.do_sample_frames = False
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
|
||||
self.assertEqual(encoded_videos.shape[1], 8)
|
||||
self.assertEqual(encoded_videos_batched.shape[1], 8)
|
||||
|
||||
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
|
||||
video_processing.do_sample_frames = True
|
||||
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
|
||||
batched_metadata = metadata * len(video_inputs)
|
||||
|
||||
# Sample with `fps` requires metadata to infer number of frames from total duration
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=6, fps=3)[
|
||||
self.input_name
|
||||
]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=6, fps=3)[
|
||||
self.input_name
|
||||
]
|
||||
|
||||
encoded_videos = video_processing(
|
||||
video_inputs[0], return_tensors="pt", num_frames=6, fps=3, video_metadata=metadata
|
||||
)[self.input_name]
|
||||
encoded_videos_batched = video_processing(
|
||||
video_inputs, return_tensors="pt", num_frames=6, fps=3, video_metadata=batched_metadata
|
||||
)[self.input_name]
|
||||
self.assertEqual(encoded_videos.shape[1], 6)
|
||||
self.assertEqual(encoded_videos_batched.shape[1], 6)
|
||||
|
||||
# We should raise error when asked to sample more frames than there are in input video
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=10, num_frames=20)[
|
||||
self.input_name
|
||||
]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=10, num_frames=20)[
|
||||
self.input_name
|
||||
]
|
||||
|
||||
# Assign back the actual num frames in tester
|
||||
self.video_processor_tester.num_frames = prev_num_frames
|
||||
|
@ -507,7 +507,7 @@ class ProcessorTesterMixin:
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"video_processor attribute not present in {self.processor_class}")
|
||||
processor_components = self.prepare_components()
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
@ -515,7 +515,7 @@ class ProcessorTesterMixin:
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
inputs = processor(text=input_str, videos=video_input, return_tensors="pt")
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 117)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 167)
|
||||
|
||||
def test_video_processor_defaults_preserved_by_video_kwargs(self):
|
||||
"""
|
||||
@ -529,7 +529,7 @@ class ProcessorTesterMixin:
|
||||
processor_components["video_processor"] = self.get_component(
|
||||
"video_processor", do_rescale=True, rescale_factor=-1
|
||||
)
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
@ -553,9 +553,9 @@ class ProcessorTesterMixin:
|
||||
input_str = self.prepare_text_inputs(modality="video")
|
||||
video_input = self.prepare_video_inputs()
|
||||
inputs = processor(
|
||||
text=input_str, videos=video_input, return_tensors="pt", max_length=112, padding="max_length"
|
||||
text=input_str, videos=video_input, return_tensors="pt", max_length=162, padding="max_length"
|
||||
)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 112)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 162)
|
||||
|
||||
def test_kwargs_overrides_default_video_processor_kwargs(self):
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
@ -564,7 +564,7 @@ class ProcessorTesterMixin:
|
||||
processor_components["video_processor"] = self.get_component(
|
||||
"video_processor", do_rescale=True, rescale_factor=1
|
||||
)
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
processor_components["tokenizer"] = self.get_component("tokenizer", max_length=167, padding="max_length")
|
||||
processor_kwargs = self.prepare_processor_dict()
|
||||
|
||||
processor = self.processor_class(**processor_components, **processor_kwargs)
|
||||
@ -593,11 +593,11 @@ class ProcessorTesterMixin:
|
||||
do_rescale=True,
|
||||
rescale_factor=-1,
|
||||
padding="max_length",
|
||||
max_length=76,
|
||||
max_length=176,
|
||||
)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
|
||||
|
||||
def test_unstructured_kwargs_batched_video(self):
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
@ -616,13 +616,13 @@ class ProcessorTesterMixin:
|
||||
do_rescale=True,
|
||||
rescale_factor=-1,
|
||||
padding="longest",
|
||||
max_length=76,
|
||||
max_length=176,
|
||||
)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertTrue(
|
||||
len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
|
||||
and len(inputs[self.text_input_name][1]) < 76
|
||||
and len(inputs[self.text_input_name][1]) < 176
|
||||
)
|
||||
|
||||
def test_doubly_passed_kwargs_video(self):
|
||||
@ -659,14 +659,14 @@ class ProcessorTesterMixin:
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 176},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, videos=video_input, **all_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
|
||||
|
||||
def test_structured_kwargs_nested_from_dict_video(self):
|
||||
if "video_processor" not in self.processor_class.attributes:
|
||||
@ -682,12 +682,12 @@ class ProcessorTesterMixin:
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"videos_kwargs": {"do_rescale": True, "rescale_factor": -1},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 176},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, videos=video_input, **all_kwargs)
|
||||
self.assertLessEqual(inputs[self.videos_input_name][0].mean(), 0)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 76)
|
||||
self.assertEqual(inputs[self.text_input_name].shape[-1], 176)
|
||||
|
||||
# TODO: the same test, but for audio + text processors that have strong overlap in kwargs
|
||||
# TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
|
||||
@ -884,7 +884,7 @@ class ProcessorTesterMixin:
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors=return_tensors,
|
||||
num_frames=4, # by default no more than 4 frames, otherwise too slow
|
||||
num_frames=2, # by default no more than 2 frames, otherwise too slow
|
||||
)
|
||||
input_name = getattr(self, input_name)
|
||||
self.assertTrue(input_name in out_dict)
|
||||
@ -983,6 +983,21 @@ class ProcessorTesterMixin:
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), video_fps * 10)
|
||||
|
||||
# When `do_sample_frames=False`, no sampling is done and the whole video is loaded, even if a number of frames is passed
|
||||
video_fps = 1
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
do_sample_frames=False,
|
||||
video_fps=video_fps,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 300)
|
||||
|
||||
# Load with `video_fps` and `num_frames` args, should raise an error
|
||||
with self.assertRaises(ValueError):
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
@ -1024,75 +1039,6 @@ class ProcessorTesterMixin:
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 2)
|
||||
|
||||
@require_av
|
||||
@require_torch
|
||||
def test_apply_chat_template_video_special_processing(self):
|
||||
"""
|
||||
Tests that models can use their own preprocessing to preprocess conversations.
|
||||
"""
|
||||
processor = self.get_processor()
|
||||
if processor.chat_template is None:
|
||||
self.skipTest("Processor has no chat template")
|
||||
|
||||
signature = inspect.signature(processor.__call__)
|
||||
if "videos" not in {*signature.parameters.keys()} or (
|
||||
signature.parameters.get("videos") is not None
|
||||
and signature.parameters["videos"].annotation == inspect._empty
|
||||
):
|
||||
self.skipTest("Processor doesn't accept videos at input")
|
||||
|
||||
video_file_path = hf_hub_download(
|
||||
repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
|
||||
)
|
||||
messages = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
|
||||
def _process_messages_for_chat_template(
|
||||
conversation,
|
||||
batch_images,
|
||||
batch_videos,
|
||||
batch_video_metadata,
|
||||
**chat_template_kwargs,
|
||||
):
|
||||
# Let us just always return a dummy prompt
|
||||
new_msg = [
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video"}, # no need to use path, video is loaded already by this moment
|
||||
{"type": "text", "text": "Dummy prompt for preprocess testing"},
|
||||
],
|
||||
},
|
||||
]
|
||||
]
|
||||
return new_msg
|
||||
|
||||
processor._process_messages_for_chat_template = _process_messages_for_chat_template
|
||||
out_dict_with_video = processor.apply_chat_template(
|
||||
messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||
|
||||
# Check with `in` because we don't know how each template formats the prompt with BOS/EOS/etc
|
||||
formatted_text = processor.batch_decode(out_dict_with_video["input_ids"], skip_special_tokens=True)[0]
|
||||
self.assertTrue("Dummy prompt for preprocess testing" in formatted_text)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 243)
|
||||
|
||||
@require_librosa
|
||||
@require_av
|
||||
def test_chat_template_audio_from_video(self):
|
||||
|
@ -293,6 +293,59 @@ class VideoProcessingTestMixin:
|
||||
(self.video_processor_tester.batch_size, *expected_output_video_shape),
|
||||
)
|
||||
|
||||
def test_call_sample_frames(self):
|
||||
for video_processing_class in self.video_processor_list:
|
||||
video_processing = video_processing_class(**self.video_processor_dict)
|
||||
|
||||
prev_num_frames = self.video_processor_tester.num_frames
|
||||
self.video_processor_tester.num_frames = 8
|
||||
video_inputs = self.video_processor_tester.prepare_video_inputs(
|
||||
equal_resolution=False,
|
||||
return_tensors="torch",
|
||||
)
|
||||
|
||||
# Force set sampling to False. No sampling is expected even when `num_frames` exists
|
||||
video_processing.do_sample_frames = False
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
|
||||
self.assertEqual(encoded_videos.shape[1], 8)
|
||||
self.assertEqual(encoded_videos_batched.shape[1], 8)
|
||||
|
||||
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
|
||||
video_processing.do_sample_frames = True
|
||||
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
|
||||
self.assertEqual(encoded_videos.shape[1], 3)
|
||||
self.assertEqual(encoded_videos_batched.shape[1], 3)
|
||||
|
||||
# Sample with `fps` requires metadata to infer number of frames from total duration
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", fps=3)[self.input_name]
|
||||
|
||||
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
|
||||
batched_metadata = metadata * len(video_inputs)
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
|
||||
self.input_name
|
||||
]
|
||||
encoded_videos_batched = video_processing(
|
||||
video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
|
||||
)[self.input_name]
|
||||
self.assertEqual(encoded_videos.shape[1], 6)
|
||||
self.assertEqual(encoded_videos_batched.shape[1], 6)
|
||||
|
||||
# We should raise error when asked to sample more frames than there are in input video
|
||||
with self.assertRaises(ValueError):
|
||||
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
|
||||
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
|
||||
self.input_name
|
||||
]
|
||||
|
||||
# Assign back the actual num frames in tester
|
||||
self.video_processor_tester.num_frames = prev_num_frames
|
||||
|
||||
def test_nested_input(self):
|
||||
"""Tests that the processor can work with nested list where each video is a list of arrays"""
|
||||
for video_processing_class in self.video_processor_list:
|
||||
|