Qwen 2.5 Omni: apply video defaults (#37660)

* Apply video defaults for min_pixels and max_pixels

* fps kwarg should not be a list

* Update test to account for new resizing
Pedro Cuenca 2025-04-23 17:08:11 +02:00 committed by GitHub
parent 1e9087368c
commit 63c6331387
2 changed files with 8 additions and 7 deletions
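
Context (not part of the commit): the values added to `_defaults` below are only defaults; they are merged with whatever the caller passes, so the pixel budget and the scalar fps still apply automatically when nothing is given. A hypothetical repro of the new behaviour, with the checkpoint name, prompt, and dummy clip as assumptions:

import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
video = [np.zeros((360, 640, 3), dtype=np.uint8)] * 8   # dummy 8-frame clip
inputs = processor(
    text="describe the video",   # a real prompt would carry the model's video placeholder token
    videos=[video],
    return_tensors="pt",
)
# With no explicit kwargs, frames are resized into the 128*28*28 .. 768*28*28 pixel
# budget and fps falls back to the scalar 2.0 introduced in the diff below.
print(inputs["pixel_values_videos"].shape)   # same key the tests assert via self.videos_input_name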


@@ -61,6 +61,8 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
             "seconds_per_chunk": 2.0,
             "position_id_per_seconds": 25,
             "use_audio_in_video": False,
+            "min_pixels": 128 * 28 * 28,
+            "max_pixels": 768 * 28 * 28,
         },
         "audio_kwargs": {
             "sampling_rate": 16000,
@@ -147,7 +149,7 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
         seconds_per_chunk = output_kwargs["videos_kwargs"].pop("seconds_per_chunk")
         position_id_per_seconds = output_kwargs["videos_kwargs"].pop("position_id_per_seconds")
         use_audio_in_video = output_kwargs["videos_kwargs"].pop("use_audio_in_video")
-        fps = output_kwargs["videos_kwargs"].pop("fps", None)
+        fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)

         if audio is not None:
             output_kwargs["audio_kwargs"]["padding"] = "max_length"  # Support "max_length" padding only here
@@ -174,8 +176,7 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
         if videos is not None:
             videos = make_batched_videos(videos)
             videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["videos_kwargs"])
-            if fps is None:
-                fps = [2.0] * len(videos)
+            fps = [fps] * len(videos)
             videos_inputs["video_second_per_grid"] = [
                 self.image_processor.temporal_patch_size / fps[i] for i in range(len(fps))
             ]
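
Worked example (not part of the diff): with the scalar default, fps is broadcast to one entry per video, and each entry of video_second_per_grid is temporal_patch_size / fps. Assuming the usual temporal_patch_size of 2:

temporal_patch_size = 2            # assumed default of the Qwen2-VL-style image processor
fps = 2.0                          # new scalar default when the caller passes no fps
videos = ["clip_a", "clip_b"]      # stand-ins for two decoded video clips

fps = [fps] * len(videos)          # -> [2.0, 2.0], one value per video
video_second_per_grid = [temporal_patch_size / f for f in fps]
print(video_second_per_grid)       # [1.0, 1.0]: each temporal grid step covers one second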


@@ -433,7 +433,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             num_frames=num_frames,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 9568)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760)

         # Load with `video_fps` arg
         video_fps = 1
@@ -445,7 +445,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             video_fps=video_fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 23920)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400)

         # Load with `video_fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -466,7 +466,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 717600)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000)

         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video
@@ -484,7 +484,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5704)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)

     @require_av
     def test_apply_chat_template_video_special_processing(self):
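
Why the expected lengths shrink (not part of the diff): the new max_pixels default caps how large each frame can be after resizing, so the processor emits fewer visual patches and the hard-coded counts above drop accordingly. A back-of-the-envelope upper bound, assuming the usual 14-pixel spatial patches and 2-frame temporal patching of the Qwen2-VL image processors:

max_pixels = 768 * 28 * 28                 # new per-frame area budget
spatial_patch_size = 14                    # assumed spatial patch size
temporal_patch_size = 2                    # assumed temporal patch size

max_patches_per_frame_pair = max_pixels // (spatial_patch_size ** 2)   # 3072
num_frames = 8                             # hypothetical clip length
max_video_patches = (num_frames // temporal_patch_size) * max_patches_per_frame_pair
print(max_patches_per_frame_pair, max_video_patches)   # 3072 12288: bound for an 8-frame clip; longer clips scale with frame count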