diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py
index 0382ad65098..e7d08239fe9 100644
--- a/src/transformers/models/auto/video_processing_auto.py
+++ b/src/transformers/models/auto/video_processing_auto.py
@@ -46,12 +46,15 @@ if TYPE_CHECKING:
 else:
     VIDEO_PROCESSOR_MAPPING_NAMES = OrderedDict(
         [
+            ("instructblip", "InstructBlipVideoVideoProcessor"),
             ("instructblipvideo", "InstructBlipVideoVideoProcessor"),
             ("internvl", "InternVLVideoProcessor"),
             ("llava_next_video", "LlavaNextVideoVideoProcessor"),
             ("llava_onevision", "LlavaOnevisionVideoProcessor"),
-            ("qwen2_5_vl", "Qwen2_5_VLVideoProcessor"),
+            ("qwen2_5_omni", "Qwen2VLVideoProcessor"),
+            ("qwen2_5_vl", "Qwen2VLVideoProcessor"),
             ("qwen2_vl", "Qwen2VLVideoProcessor"),
+            ("smolvlm", "SmolVLMVideoProcessor"),
             ("video_llava", "VideoLlavaVideoProcessor"),
         ]
     )
diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index f70fad216ce..d83af3a9f65 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -156,21 +156,17 @@ class VideoLlavaProcessor(ProcessorMixin):
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
             - **pixel_values_videos** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
         """
-        data = {}
-        if images is not None:
-            encoded_images = self.image_processor(images=images, return_tensors=return_tensors)
-            data.update(encoded_images)
-
-        if videos is not None:
-            encoded_videos = self.video_processor(videos=videos, return_tensors=return_tensors)
-            data.update(encoded_videos)
 
         if isinstance(text, str):
             text = [text]
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
 
-        if encoded_images is not None:
+        data = {}
+        if images is not None:
+            encoded_images = self.image_processor(images=images, return_tensors=return_tensors)
+            data.update(encoded_images)
+
             height, width = get_image_size(to_numpy_array(encoded_images.get("pixel_values_images")[0]))
             num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
             num_image_tokens += self.num_additional_image_tokens
@@ -178,7 +174,10 @@ class VideoLlavaProcessor(ProcessorMixin):
                 num_image_tokens -= 1
             text = [sample.replace(self.image_token, self.image_token * num_image_tokens) for sample in text]
 
-        if encoded_videos is not None:
+        if videos is not None:
+            encoded_videos = self.video_processor(videos=videos, return_tensors=return_tensors)
+            data.update(encoded_videos)
+
             one_video = encoded_videos.get("pixel_values_videos")[0]
             if isinstance(encoded_videos.get("pixel_values_videos")[0], (list, tuple)):
                 one_video = np.array(one_video)
diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py
index 577dd669860..3f52d8291d7 100644
--- a/tests/models/llava_next_video/test_modeling_llava_next_video.py
+++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py
@@ -415,7 +415,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
             "llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True, cache_dir="./"
         )
 
-        inputs = self.processor(self.prompt_video, videos=self.video, return_tensors="pt")
+        inputs = self.processor(text=self.prompt_video, videos=self.video, return_tensors="pt")
         # verify single forward pass
         inputs = inputs.to(torch_device)
         with torch.no_grad():
@@ -438,7 +438,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
         inputs = self.processor(
-            [self.prompt_video, self.prompt_video],
+            text=[self.prompt_video, self.prompt_video],
             videos=[self.video, self.video],
             return_tensors="pt",
             padding=True,
@@ -465,7 +465,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
         inputs = self.processor(
-            [self.prompt_image, self.prompt_video],
+            text=[self.prompt_image, self.prompt_video],
             images=self.image,
             videos=self.video,
             return_tensors="pt",
@@ -491,7 +491,7 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
         )
 
         inputs_batched = self.processor(
-            [self.prompt_video, self.prompt_image],
+            text=[self.prompt_video, self.prompt_image],
             images=[self.image],
             videos=[self.video],
             return_tensors="pt",
diff --git a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
index 5070383228b..747993cbb68 100644
--- a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
@@ -648,7 +648,12 @@ class Qwen2_5OmniModelIntegrationTest(unittest.TestCase):
             self.messages[0],
             {
                 "role": "assistant",
-                "content": "The sound is glass shattering, and the dog appears to be a Labrador Retriever.",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "The sound is glass shattering, and the dog appears to be a Labrador Retriever.",
+                    }
+                ],
             },
             {
                 "role": "user",
@@ -687,7 +692,12 @@ class Qwen2_5OmniModelIntegrationTest(unittest.TestCase):
         messages = [
             {
                 "role": "system",
-                "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
+                    }
+                ],
             },
             {
                 "role": "user",
@@ -697,7 +707,7 @@ class Qwen2_5OmniModelIntegrationTest(unittest.TestCase):
         audio, _ = librosa.load(BytesIO(urlopen(audio_url).read()), sr=self.processor.feature_extractor.sampling_rate)
         text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-        inputs = self.processor(text=[text], audio=[audio], return_tensors="pt", padding=True).to(torch_device)
+        inputs = self.processor(text=text, audio=[audio], return_tensors="pt", padding=True).to(torch_device)
 
         output = model.generate(**inputs, thinker_temperature=0, thinker_do_sample=False)
 
diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py
index 92ad5a193bf..3e81044e6b6 100644
--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -466,7 +466,7 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
             repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
        )
         video_file = np.load(video_file)
-        inputs = self.processor(prompt, videos=video_file, return_tensors="pt").to(torch_device)
+        inputs = self.processor(text=prompt, videos=video_file, return_tensors="pt").to(torch_device)
 
         EXPECTED_INPUT_IDS = torch.tensor([1, 3148, 1001, 29901, 29871, 13, 11008, 338, 445, 4863, 2090, 1460, 29973, 319, 1799, 9047, 13566, 29901], device=torch_device)  # fmt: skip
         non_video_inputs = inputs["input_ids"][inputs["input_ids"] != 32001]
@@ -496,9 +496,9 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
         url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         image = Image.open(requests.get(url, stream=True).raw)
 
-        inputs = self.processor(prompts, images=[image], videos=[video_file], padding=True, return_tensors="pt").to(
-            torch_device
-        )
+        inputs = self.processor(
+            text=prompts, images=[image], videos=[video_file], padding=True, return_tensors="pt"
+        ).to(torch_device)
         output = model.generate(**inputs, do_sample=False, max_new_tokens=20)
 
         EXPECTED_DECODED_TEXT = [
@@ -522,7 +522,7 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
             repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
         )
         video_file = np.load(video_file)
-        inputs = self.processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
+        inputs = self.processor(text=prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
 
         output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
         EXPECTED_DECODED_TEXT = "USER: \nDescribe the video in details. ASSISTANT: The video features a young child sitting on a bed, holding a book and reading it. " \
@@ -554,7 +554,7 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
             hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset")
         )
 
-        inputs = processor(prompts, videos=[video_1, video_2], return_tensors="pt", padding=True).to(torch_device)
+        inputs = processor(text=prompts, videos=[video_1, video_2], return_tensors="pt", padding=True).to(torch_device)
 
         output = model.generate(**inputs, max_new_tokens=20)
 
diff --git a/tests/utils/test_video_utils.py b/tests/utils/test_video_utils.py
index 96e7e62638f..441838ffcab 100644
--- a/tests/utils/test_video_utils.py
+++ b/tests/utils/test_video_utils.py
@@ -71,8 +71,8 @@ class BaseVideoProcessorTester(unittest.TestCase):
 
         # Test a list of videos is converted to a list of 1 video
         video = get_random_video(16, 32)
-        video = [PIL.Image.fromarray(frame) for frame in video]
-        videos_list = make_batched_videos(video)
+        pil_video = [PIL.Image.fromarray(frame) for frame in video]
+        videos_list = make_batched_videos(pil_video)
         self.assertIsInstance(videos_list, list)
         self.assertIsInstance(videos_list[0], np.ndarray)
         self.assertEqual(videos_list[0].shape, (8, 16, 32, 3))
@@ -80,8 +80,8 @@ class BaseVideoProcessorTester(unittest.TestCase):
 
         # Test a nested list of videos is not modified
         video = get_random_video(16, 32)
-        video = [PIL.Image.fromarray(frame) for frame in video]
-        videos = [video, video]
+        pil_video = [PIL.Image.fromarray(frame) for frame in video]
+        videos = [pil_video, pil_video]
         videos_list = make_batched_videos(videos)
         self.assertIsInstance(videos_list, list)
         self.assertIsInstance(videos_list[0], np.ndarray)