mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
Qwen 2.5 Omni: apply video defaults (#37660)

* Apply video defaults for min_pixels and max_pixels
* fps kwarg should not be a list
* Update test to account for new resizing
This commit is contained in:
parent
1e9087368c
commit
63c6331387
@@ -61,6 +61,8 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
             "seconds_per_chunk": 2.0,
             "position_id_per_seconds": 25,
             "use_audio_in_video": False,
+            "min_pixels": 128 * 28 * 28,
+            "max_pixels": 768 * 28 * 28,
         },
         "audio_kwargs": {
             "sampling_rate": 16000,
@@ -147,7 +149,7 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
         seconds_per_chunk = output_kwargs["videos_kwargs"].pop("seconds_per_chunk")
         position_id_per_seconds = output_kwargs["videos_kwargs"].pop("position_id_per_seconds")
         use_audio_in_video = output_kwargs["videos_kwargs"].pop("use_audio_in_video")
-        fps = output_kwargs["videos_kwargs"].pop("fps", None)
+        fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)

         if audio is not None:
             output_kwargs["audio_kwargs"]["padding"] = "max_length"  # Support "max_length" padding only here
@@ -174,8 +176,7 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
         if videos is not None:
             videos = make_batched_videos(videos)
             videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["videos_kwargs"])
-            if fps is None:
-                fps = [2.0] * len(videos)
+            fps = [fps] * len(videos)
             videos_inputs["video_second_per_grid"] = [
                 self.image_processor.temporal_patch_size / fps[i] for i in range(len(fps))
             ]
|
@@ -433,7 +433,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             num_frames=num_frames,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 9568)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760)

         # Load with `video_fps` arg
         video_fps = 1
@@ -445,7 +445,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             video_fps=video_fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 23920)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400)

         # Load with `video_fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -466,7 +466,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 717600)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000)

         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video
@@ -484,7 +484,7 @@ class Qwen2_5OmniProcessorTest(ProcessorTesterMixin, unittest.TestCase):
             return_dict=True,
        )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5704)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)

     @require_av
     def test_apply_chat_template_video_special_processing(self):
Loading…
Reference in New Issue
Block a user