[qwen-omni] fix processor (#37493)

* fix

* delete print

* accept kwargs in overridden models as well

* remove duplicate
This commit is contained in:
Raushan Turganbay 2025-04-14 17:30:31 +02:00 committed by GitHub
parent d228f50acc
commit cb39f7dd5b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 13 additions and 6 deletions

View File

@ -244,11 +244,13 @@ class Qwen2_5OmniProcessor(ProcessorMixin):
curr_video_grid_thw = next(video_grid_thw)
height = curr_video_grid_thw[1] // self.image_processor.merge_size
width = curr_video_grid_thw[2] // self.image_processor.merge_size
video_token_indices = np.arange(curr_video_grid_thw[0]).view(-1, 1, 1)
video_token_indices = video_token_indices.expand(-1, height, width).flatten()
video_token_indices = np.arange(curr_video_grid_thw[0]).reshape(-1, 1, 1)
video_token_indices = np.broadcast_to(
video_token_indices, (video_token_indices.shape[0], height, width)
).reshape(-1)
video_token_indices = (
video_token_indices * next(video_second_per_grid) * position_id_per_seconds
).long()
)
tokens_per_chunk = int(position_id_per_seconds * seconds_per_chunk)
video_chunk_indexes = self.get_chunked_index(video_token_indices, tokens_per_chunk)

View File

@ -436,6 +436,7 @@ class SmolVLMProcessor(ProcessorMixin):
fps: Optional[int] = None,
backend: str = "opencv",
skip_secs: int = 0.0,
**kwargs,
) -> np.array:
"""
Loads `video` to a numpy array.

View File

@ -1427,7 +1427,6 @@ class ProcessorMixin(PushToHubMixin):
# Fill sets of kwargs that should be used by different parts of template
processed_kwargs = {
"processor_kwargs": {},
"mm_load_kwargs": {},
"template_kwargs": {},
}
@ -1551,14 +1550,14 @@ class ProcessorMixin(PushToHubMixin):
# without actionable solution for users
single_prompt = prompt[0] if is_batched else prompt
if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
processed_kwargs["processor_kwargs"]["add_special_tokens"] = False
kwargs["add_special_tokens"] = False
out = self(
text=prompt,
images=batch_images if batch_images else None,
videos=batch_videos if batch_videos else None,
audio=batch_audios if batch_audios else None,
**processed_kwargs["processor_kwargs"],
**kwargs,
)
if return_dict:
return out
@ -1574,6 +1573,7 @@ class ProcessorMixin(PushToHubMixin):
num_frames: Optional[int] = None,
fps: Optional[int] = None,
backend: str = "opencv",
**kwargs,
) -> np.array:
"""
Loads `video` to a numpy array.

View File

@ -228,6 +228,10 @@ class GenerationTesterMixin:
"video_token_index",
"video_token_id",
"vision_start_token_id",
"audio_token_index",
"audio_start_token_id",
"audio_end_token_id",
"vision_end_token_id",
]:
token_index = getattr(config, key, None)
if token_index is None and hasattr(self, "model_tester"):