Mirror of https://github.com/huggingface/transformers.git
Fix "test_chat_template_dict" in video LLMs (#35660)
* fix "test_chat_template_dict" in llava_onevision * Update src/transformers/models/llava_next_video/processing_llava_next_video.py Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com> * get one video calles once --------- Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
Commit 705aeaaa12 (parent e867b97443)
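All three processors below get the same fix: the first entry of `pixel_values_videos` may be an array/tensor, or a nested Python list when the video processor runs without tensor conversion, and `to_numpy_array` only accepts array-like images. A minimal sketch of that normalization, written as a hypothetical standalone helper (the commit itself inlines this logic in each processor's `__call__`):

# Hypothetical helper illustrating the pattern this commit inlines into each
# processor; the function name is illustrative, not part of transformers.
import numpy as np

from transformers.image_utils import to_numpy_array


def first_video_as_array(pixel_values_videos):
    """Return the first video as an ndarray shaped (num_frames, ...)."""
    one_video = pixel_values_videos[0]
    if isinstance(one_video, (list, tuple)):
        # No tensor conversion was requested: frames arrive as nested lists,
        # which to_numpy_array() rejects, so stack them with plain np.array.
        one_video = np.array(one_video)
    else:
        # Already an ndarray / framework tensor: reuse the existing converter.
        one_video = to_numpy_array(one_video)
    return one_video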
src/transformers/models/llava_next_video/processing_llava_next_video.py

@@ -18,6 +18,8 @@ Processor class for LLaVa-NeXT-Video.
 
 from typing import TYPE_CHECKING, List, Optional, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils import select_best_resolution
 from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
@@ -193,7 +195,11 @@ class LlavaNextVideoProcessor(ProcessorMixin):
 
         # videos are easier, simply get frames and multiply
         if videos_inputs:
-            one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
+            one_video = videos_inputs.get("pixel_values_videos")[0]
+            if isinstance(one_video, (list, tuple)):
+                one_video = np.array(one_video)
+            else:
+                one_video = to_numpy_array(one_video)
             height, width = get_image_size(one_video[0])
             num_frames = one_video.shape[0]  # frame dim is always after batch dim
 
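For context, a sketch of the failure mode the old one-liner hit, assuming `to_numpy_array` rejects plain Python lists as it does in current transformers (shapes below are illustrative):

# Old path vs. new path, with a toy video shaped (num_frames, channels, H, W).
import numpy as np

from transformers.image_utils import to_numpy_array

frames_as_lists = np.zeros((8, 3, 32, 32)).tolist()  # video returned without tensor conversion

try:
    to_numpy_array(frames_as_lists)  # old code: a Python list is not a valid image type
except ValueError as err:
    print("old path:", err)

one_video = np.array(frames_as_lists)  # new list/tuple branch
print("new path:", one_video.shape)    # (8, 3, 32, 32)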
src/transformers/models/llava_onevision/processing_llava_onevision.py

@@ -20,6 +20,8 @@ import math
 import os
 from typing import Iterable, List, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_processing_utils import select_best_resolution
 from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
@@ -164,7 +166,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
         if videos is not None:
             video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
 
-            one_video = to_numpy_array(video_inputs.get("pixel_values_videos")[0])
+            one_video = video_inputs.get("pixel_values_videos")[0]
+            if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
+                one_video = np.array(one_video)
+            else:
+                one_video = to_numpy_array(one_video)
             height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
             num_frames = one_video.shape[0]  # frame dim is always after batch dim
             patches_height_width = int(math.sqrt(self.num_image_tokens))
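The onevision variant additionally passes the configured data format through to `get_image_size`; a small sketch of what that argument changes, using `ChannelDimension` from `transformers.image_utils` (the frame shape is illustrative):

# With channel_dim=None, get_image_size infers which axis holds the channels;
# passing the data_format from images_kwargs makes the lookup explicit.
import numpy as np

from transformers.image_utils import ChannelDimension, get_image_size

frame = np.zeros((3, 384, 384))  # one channels-first video frame

print(get_image_size(frame))                                      # inferred  -> (384, 384)
print(get_image_size(frame, channel_dim=ChannelDimension.FIRST))  # explicit  -> (384, 384)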
src/transformers/models/video_llava/processing_video_llava.py

@@ -18,6 +18,8 @@ Processor class for VideoLlava.
 
 from typing import List, Optional, Union
 
+import numpy as np
+
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
 from ...processing_utils import ProcessorMixin
@@ -165,7 +167,11 @@ class VideoLlavaProcessor(ProcessorMixin):
                 num_frames = 1
 
             if "pixel_values_videos" in encoded_images.keys():
-                one_video = to_numpy_array(encoded_images.get("pixel_values_videos")[0])
+                one_video = encoded_images.get("pixel_values_videos")[0]
+                if isinstance(encoded_images.get("pixel_values_videos")[0], (list, tuple)):
+                    one_video = np.array(one_video)
+                else:
+                    one_video = to_numpy_array(one_video)
                 height, width = get_image_size(one_video[0])
                 num_frames = one_video.shape[0]  # frame dim is always after batch dim
 
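Whichever branch runs, `one_video` ends up as a plain ndarray, so the size and frame-count lookups that follow behave the same for both return types; a short sketch under that assumption (shapes illustrative):

# Both branches converge on the same ndarray, so get_image_size() and the
# frame-count lookup are unaffected by how the video processor returned data.
import numpy as np

from transformers.image_utils import get_image_size, to_numpy_array

stacked = np.zeros((8, 3, 32, 32))  # returned as a stacked array
nested = stacked.tolist()           # returned as nested lists

for candidate in (stacked, nested):
    if isinstance(candidate, (list, tuple)):
        one_video = np.array(candidate)
    else:
        one_video = to_numpy_array(candidate)
    height, width = get_image_size(one_video[0])
    num_frames = one_video.shape[0]  # frame dim is always after batch dim
    print(num_frames, height, width)  # 8 32 32 in both cases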