From 0d69fa6dcd9886a380c6dfd9daa89c795e7a6f9b Mon Sep 17 00:00:00 2001
From: Raushan Turganbay
Date: Thu, 5 Jun 2025 10:11:58 +0200
Subject: [PATCH] [qwen-omni] fix sliding window (#38525)

fix
---
 .../qwen2_5_omni/configuration_qwen2_5_omni.py   | 15 ++++++++++++++-
 .../models/qwen2_5_omni/modular_qwen2_5_omni.py  | 15 ++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
index c790cceefc9..3f76da5e3eb 100644
--- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
@@ -658,6 +658,8 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         spatial_merge_size (`int`, *optional*, defaults to 2):
             The size used for merging spatial dimensions.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
 
     Example:
 
@@ -726,6 +728,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
         audio_end_token_id=151648,
         initializer_range=0.02,
         spatial_merge_size=2,
+        layer_types=None,
         **kwargs,
     ):
         self.audio_token_index = audio_token_index
@@ -753,7 +756,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
         self.max_window_layers = max_window_layers
 
         # for backward compatibility
@@ -775,6 +778,16 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
 
         self.initializer_range = initializer_range
         self.spatial_merge_size = spatial_merge_size
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index 6f40803f803..46bd0f23209 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -697,6 +697,8 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         spatial_merge_size (`int`, *optional*, defaults to 2):
             The size used for merging spatial dimensions.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
 
     Example:
 
@@ -765,6 +767,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
         audio_end_token_id=151648,
         initializer_range=0.02,
         spatial_merge_size=2,
+        layer_types=None,
         **kwargs,
     ):
         self.audio_token_index = audio_token_index
@@ -792,7 +795,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
         self.max_window_layers = max_window_layers
 
         # for backward compatibility
@@ -814,6 +817,16 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
 
         self.initializer_range = initializer_range
         self.spatial_merge_size = spatial_merge_size
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention"
+                if self.sliding_window is not None and i >= self.max_window_layers
+                else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
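
A minimal sketch of how the new `layer_types` default behaves once this patch is applied. The constructor arguments below are illustrative values chosen for the example, not the checkpoint defaults; only the class name, file path, and parameter names come from the diff above.

# sketch.py -- assumes a transformers version that includes this patch
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import Qwen2_5OmniTalkerConfig

config = Qwen2_5OmniTalkerConfig(
    num_hidden_layers=8,      # illustrative, not the real default
    use_sliding_window=True,  # if False, sliding_window is now forced to None
    sliding_window=4096,      # illustrative window size
    max_window_layers=4,      # layers below this index keep full attention
)

# With these values the derived pattern is:
# layers 0-3 -> "full_attention", layers 4-7 -> "sliding_attention"
print(config.layer_types)

# With use_sliding_window=False, sliding_window becomes None and every layer
# falls back to "full_attention".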