diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py
index 6e618b1ce59..f6f40c1bd83 100644
--- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py
@@ -176,7 +176,7 @@ class LlavaOnevisionConfig(PretrainedConfig):
                 patch_size=14,
                 image_size=384,
                 num_hidden_layers=26,
-                num_attention_heads=14,
+                num_attention_heads=16,
                 vision_use_head=False,
             )
diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py
index ada719a70e0..72f63c37ffd 100644
--- a/src/transformers/models/smolvlm/processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/processing_smolvlm.py
@@ -434,6 +434,10 @@ class SmolVLMProcessor(ProcessorMixin):
         if chat_template is None and has_video:
             # re-assign to the correct default template for BC, if user is not requesting their own template
             chat_template = DEFAULT_CHAT_TEMPLATE
+
+        kwargs.setdefault("num_frames", self.video_processor.num_frames)
+        kwargs.setdefault("fps", self.video_processor.fps)
+
         return super().apply_chat_template(conversation, chat_template, **kwargs)
diff --git a/tests/models/smolvlm/test_modeling_smolvlm.py b/tests/models/smolvlm/test_modeling_smolvlm.py
index 280399eb6b8..135043e9860 100644
--- a/tests/models/smolvlm/test_modeling_smolvlm.py
+++ b/tests/models/smolvlm/test_modeling_smolvlm.py
@@ -536,23 +536,24 @@ class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):
                 ).content
             )
         )
-        self.image2 = Image.open(
-            BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
-        )
-        self.image3 = Image.open(
-            BytesIO(
-                requests.get(
-                    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
-                ).content
-            )
-        )
+
+        self.video_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
+                    },
+                    {"type": "text", "text": "Describe this video in detail"},
+                ],
+            },
+        ]
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
     @slow
-    # TODO (Orr?) this is a dummy test to check if the model generates things that make sense.
-    # Needs to be expanded to a tiny video
     def test_integration_test(self):
         model = SmolVLMForConditionalGeneration.from_pretrained(
             "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
             torch_dtype=torch.bfloat16,
             device_map="auto",
         )
@@ -571,3 +572,26 @@ class SmolVLMForConditionalGenerationIntegrationTest(unittest.TestCase):
         expected_generated_text = "\n\n\n\nIn this image, we see a view of the Statue of Liberty and the"
         self.assertEqual(generated_texts[0], expected_generated_text)
+
+    @slow
+    def test_integration_test_video(self):
+        model = SmolVLMForConditionalGeneration.from_pretrained(
+            "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+
+        # Create inputs
+        inputs = self.processor.apply_chat_template(
+            self.video_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(device=torch_device, dtype=torch.bfloat16)
+
+        generated_ids = model.generate(**inputs, max_new_tokens=20)
+        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        expected_generated_text = 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature' # fmt: skip
+        self.assertEqual(generated_texts[0], expected_generated_text)
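For reference, a minimal usage sketch of the `SmolVLMProcessor.apply_chat_template` change above, assuming the standard `AutoProcessor` loading API. The checkpoint name and message layout are taken from the new integration test; the explicit `num_frames=8` override is a hypothetical value used only to illustrate that `setdefault` leaves caller-supplied kwargs untouched.

```python
from transformers import AutoProcessor

# Checkpoint taken from the integration test added in this diff.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "path": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov",
            },
            {"type": "text", "text": "Describe this video in detail"},
        ],
    },
]

# No sampling kwargs passed: the processor now falls back to
# self.video_processor.num_frames / self.video_processor.fps via kwargs.setdefault.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)

# Explicit kwargs still win, since setdefault only fills keys the caller did not set.
inputs_override = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    num_frames=8,  # hypothetical override, for illustration only
)
```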