mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-04 21:30:07 +06:00
[qwen-omni] fix processor (#37493)
* fix * delete print * accept kwargs in overridden models as well * remove duplicate
This commit is contained in:
parent
d228f50acc
commit
cb39f7dd5b
@ -244,11 +244,13 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
|
|||||||
curr_video_grid_thw = next(video_grid_thw)
|
curr_video_grid_thw = next(video_grid_thw)
|
||||||
height = curr_video_grid_thw[1] // self.image_processor.merge_size
|
height = curr_video_grid_thw[1] // self.image_processor.merge_size
|
||||||
width = curr_video_grid_thw[2] // self.image_processor.merge_size
|
width = curr_video_grid_thw[2] // self.image_processor.merge_size
|
||||||
video_token_indices = np.arange(curr_video_grid_thw[0]).view(-1, 1, 1)
|
video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1)
|
||||||
video_token_indices = video_token_indices.expand(-1, height, width).flatten()
|
video_token_indices = np.broadcast_to(
|
||||||
|
video_token_indices, (video_token_indices.shape[0], height, width)
|
||||||
|
).reshape(-1)
|
||||||
video_token_indices = (
|
video_token_indices = (
|
||||||
video_token_indices * next(video_second_per_grid) * position_id_per_seconds
|
video_token_indices * next(video_second_per_grid) * position_id_per_seconds
|
||||||
).long()
|
)
|
||||||
|
|
||||||
tokens_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
|
tokens_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
|
||||||
video_chunk_indexes = self.get_chunked_index(video_token_indices, tokens_per_chunk)
|
video_chunk_indexes = self.get_chunked_index(video_token_indices, tokens_per_chunk)
|
||||||
|
@ -436,6 +436,7 @@ class SmolVLMProcessor(ProcessorMixin):
|
|||||||
fps: Optional[int] = None,
|
fps: Optional[int] = None,
|
||||||
backend: str = "opencv",
|
backend: str = "opencv",
|
||||||
skip_secs: int = 0.0,
|
skip_secs: int = 0.0,
|
||||||
|
**kwargs,
|
||||||
) -> np.array:
|
) -> np.array:
|
||||||
"""
|
"""
|
||||||
Loads `video` to a numpy array.
|
Loads `video` to a numpy array.
|
||||||
|
@ -1427,7 +1427,6 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
|
|
||||||
# Fill sets of kwargs that should be used by different parts of template
|
# Fill sets of kwargs that should be used by different parts of template
|
||||||
processed_kwargs = {
|
processed_kwargs = {
|
||||||
"processor_kwargs": {},
|
|
||||||
"mm_load_kwargs": {},
|
"mm_load_kwargs": {},
|
||||||
"template_kwargs": {},
|
"template_kwargs": {},
|
||||||
}
|
}
|
||||||
@ -1551,14 +1550,14 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
# without actionable solution for users
|
# without actionable solution for users
|
||||||
single_prompt = prompt[0] if is_batched else prompt
|
single_prompt = prompt[0] if is_batched else prompt
|
||||||
if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
|
if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
|
||||||
processed_kwargs["processor_kwargs"]["add_special_tokens"] = False
|
kwargs["add_special_tokens"] = False
|
||||||
|
|
||||||
out = self(
|
out = self(
|
||||||
text=prompt,
|
text=prompt,
|
||||||
images=batch_images if batch_images else None,
|
images=batch_images if batch_images else None,
|
||||||
videos=batch_videos if batch_videos else None,
|
videos=batch_videos if batch_videos else None,
|
||||||
audio=batch_audios if batch_audios else None,
|
audio=batch_audios if batch_audios else None,
|
||||||
**processed_kwargs["processor_kwargs"],
|
**kwargs,
|
||||||
)
|
)
|
||||||
if return_dict:
|
if return_dict:
|
||||||
return out
|
return out
|
||||||
@ -1574,6 +1573,7 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
num_frames: Optional[int] = None,
|
num_frames: Optional[int] = None,
|
||||||
fps: Optional[int] = None,
|
fps: Optional[int] = None,
|
||||||
backend: str = "opencv",
|
backend: str = "opencv",
|
||||||
|
**kwargs,
|
||||||
) -> np.array:
|
) -> np.array:
|
||||||
"""
|
"""
|
||||||
Loads `video` to a numpy array.
|
Loads `video` to a numpy array.
|
||||||
|
@ -228,6 +228,10 @@ class GenerationTesterMixin:
|
|||||||
"video_token_index",
|
"video_token_index",
|
||||||
"video_token_id",
|
"video_token_id",
|
||||||
"vision_start_token_id",
|
"vision_start_token_id",
|
||||||
|
"audio_token_index",
|
||||||
|
"audio_start_token_id",
|
||||||
|
"audio_end_token_id",
|
||||||
|
"vision_end_token_id",
|
||||||
]:
|
]:
|
||||||
token_index = getattr(config, key, None)
|
token_index = getattr(config, key, None)
|
||||||
if token_index is None and hasattr(self, "model_tester"):
|
if token_index is None and hasattr(self, "model_tester"):
|
||||||
|
Loading…
Reference in New Issue
Block a user