Mirror of https://github.com/huggingface/transformers.git
[qwen-omni] fix processor (#37493)
* fix
* delete print
* accept kwargs in overridden models as well
* remove duplicate
parent: d228f50acc
commit: cb39f7dd5b
```diff
@@ -244,11 +244,13 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
                 curr_video_grid_thw = next(video_grid_thw)
                 height = curr_video_grid_thw[1] // self.image_processor.merge_size
                 width = curr_video_grid_thw[2] // self.image_processor.merge_size
-                video_token_indices = np.arange(curr_video_grid_thw[0]).view(-1, 1, 1)
-                video_token_indices = video_token_indices.expand(-1, height, width).flatten()
+                video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1)
+                video_token_indices = np.broadcast_to(
+                    video_token_indices, (video_token_indices.shape[0], height, width)
+                ).reshape(-1)
                 video_token_indices = (
                     video_token_indices * next(video_second_per_grid) * position_id_per_seconds
-                ).long()
+                )

                 tokens_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
                 video_chunk_indexes = self.get_chunked_index(video_token_indices, tokens_per_chunk)
```
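This hunk swaps torch-style tensor calls for their numpy equivalents. A minimal standalone sketch (mine, not from the PR, with made-up sizes standing in for `curr_video_grid_thw` and `merge_size`) of why the old calls break on `np.ndarray` and what the replacements do:

```python
import numpy as np

t, height, width = 4, 2, 3
indices = np.arange(t)

# The old calls are torch.Tensor methods and fail on np.ndarray:
#   indices.view(-1, 1, 1)  -> TypeError (numpy's .view reinterprets dtype, it does not reshape)
#   indices.expand(...)     -> AttributeError (no such method on np.ndarray)
#   (...).long()            -> AttributeError (torch-only; numpy would use .astype(np.int64))

# The numpy replacements used by the fix:
reshaped = indices.reshape(-1, 1, 1)                                       # shape (t, 1, 1)
broadcast = np.broadcast_to(reshaped, (reshaped.shape[0], height, width))  # shape (t, h, w), no copy
flat = broadcast.reshape(-1)                                               # 1-D, length t * h * w
assert flat.shape == (t * height * width,)
```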
```diff
@@ -436,6 +436,7 @@ class SmolVLMProcessor(ProcessorMixin):
         fps: Optional[int] = None,
         backend: str = "opencv",
         skip_secs: int = 0.0,
+        **kwargs,
     ) -> np.array:
         """
         Loads `video` to a numpy array.
```
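The `**kwargs` added here (and to the base `ProcessorMixin` signature in a later hunk) matches the "accept kwargs in overridden models as well" bullet: an override without `**kwargs` raises `TypeError` as soon as a caller forwards an option it does not declare. A repo-independent sketch of the pattern, with made-up class and argument names:

```python
# Illustrative only; Base/Child stand in for ProcessorMixin and a model-specific processor.
from typing import Optional
import numpy as np

class Base:
    def load_video(self, video, num_frames: Optional[int] = None, backend: str = "opencv", **kwargs) -> np.ndarray:
        return np.zeros((num_frames or 1, 2, 2))

class Child(Base):
    # Accepting **kwargs keeps the override compatible when callers pass
    # options the child does not use itself (e.g. backend-specific flags).
    def load_video(self, video, num_frames: Optional[int] = None, backend: str = "opencv",
                   skip_secs: float = 0.0, **kwargs) -> np.ndarray:
        return super().load_video(video, num_frames=num_frames, backend=backend, **kwargs)

Child().load_video("clip.mp4", num_frames=4, some_extra_flag=True)  # no TypeError
```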
```diff
@@ -1427,7 +1427,6 @@ class ProcessorMixin(PushToHubMixin):
 
         # Fill sets of kwargs that should be used by different parts of template
         processed_kwargs = {
-            "processor_kwargs": {},
             "mm_load_kwargs": {},
             "template_kwargs": {},
         }
```
```diff
@@ -1551,14 +1550,14 @@ class ProcessorMixin(PushToHubMixin):
             # without actionable solution for users
             single_prompt = prompt[0] if is_batched else prompt
             if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
-                processed_kwargs["processor_kwargs"]["add_special_tokens"] = False
+                kwargs["add_special_tokens"] = False
 
             out = self(
                 text=prompt,
                 images=batch_images if batch_images else None,
                 videos=batch_videos if batch_videos else None,
                 audio=batch_audios if batch_audios else None,
-                **processed_kwargs["processor_kwargs"],
+                **kwargs,
             )
             if return_dict:
                 return out
```
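Together, these two `ProcessorMixin` hunks implement the "remove duplicate" bullet: instead of mirroring the caller's remaining keyword arguments into a nested `processed_kwargs["processor_kwargs"]` dict and then unpacking that copy, the processor call now forwards `kwargs` directly. A toy sketch of the simplified flow (function names are illustrative, not the library's):

```python
# Illustrative only: forward the caller's leftover kwargs once instead of
# copying them into a second "processor_kwargs" dict.
def apply_template(prompt, has_bos_token: bool, **kwargs):
    if has_bos_token:
        kwargs["add_special_tokens"] = False  # tweak in place, no duplicate dict
    return run_processor(text=prompt, **kwargs)

def run_processor(**kw):
    return kw

print(apply_template("hello", has_bos_token=True, padding=True))
# {'text': 'hello', 'padding': True, 'add_special_tokens': False}
```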
```diff
@@ -1574,6 +1573,7 @@ class ProcessorMixin(PushToHubMixin):
         num_frames: Optional[int] = None,
         fps: Optional[int] = None,
         backend: str = "opencv",
+        **kwargs,
     ) -> np.array:
         """
         Loads `video` to a numpy array.
```
```diff
@@ -228,6 +228,10 @@ class GenerationTesterMixin:
             "video_token_index",
             "video_token_id",
             "vision_start_token_id",
+            "audio_token_index",
+            "audio_start_token_id",
+            "audio_end_token_id",
+            "vision_end_token_id",
         ]:
             token_index = getattr(config, key, None)
             if token_index is None and hasattr(self, "model_tester"):
```
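The test hunk extends the list of special-token attributes the generation tests probe, covering the audio tokens that Qwen2.5-Omni configs define. The surrounding loop looks each key up on the config and falls back to the model tester; a simplified, self-contained sketch of that lookup (dummy class names and ids, not real values):

```python
# Hedged sketch of the getattr-with-fallback pattern shown above.
class DummyConfig:
    video_token_id = 151656
    audio_token_index = None  # not every model defines every key

class DummyTester:
    audio_token_index = 151646

config, model_tester = DummyConfig(), DummyTester()
special_ids = []
for key in ["video_token_id", "audio_token_index", "vision_end_token_id"]:
    token_index = getattr(config, key, None)
    if token_index is None:
        token_index = getattr(model_tester, key, None)  # fall back to the tester
    if token_index is not None:
        special_ids.append(token_index)
print(special_ids)  # [151656, 151646]
```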