From 1d85c391408e5b9b36798184c55f42e990812c93 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 28 Jun 2025 13:49:51 -0700 Subject: [PATCH] update Signed-off-by: Roger Wang --- .../models/gemma3n/configuration_gemma3n.py | 6 +++--- src/transformers/models/gemma3n/modeling_gemma3n.py | 4 ++-- src/transformers/models/gemma3n/modular_gemma3n.py | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index ca1a0671774..18f4cf81932 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -575,7 +575,7 @@ class Gemma3nConfig(PretrainedConfig): Custom vision config or dict. audio_config (`Union[AutoConfig, dict]`, *optional*): Custom audio config or dict. - audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): + audio_soft_tokens_per_audio (`int`, *optional*, defaults to 188): The number of soft tokens per audio clip. vision_soft_tokens_per_image (`int`, *optional*, defaults to 256): The number of soft tokens per image. @@ -631,7 +631,7 @@ class Gemma3nConfig(PretrainedConfig): text_config: Optional[Union[Gemma3nTextConfig, dict[str, Any]]] = None, vision_config: Optional[Union[Gemma3nVisionConfig, dict[str, Any]]] = None, audio_config: Optional[Union[Gemma3nAudioConfig, dict[str, Any]]] = None, - audio_soft_tokens_per_image: int = 188, + audio_soft_tokens_per_audio: int = 188, vision_soft_tokens_per_image: int = 256, boi_token_id: int = 255_999, eoi_token_id: int = 262_144, @@ -666,7 +666,7 @@ class Gemma3nConfig(PretrainedConfig): self.vision_config = vision_config self.audio_config = audio_config - self.audio_soft_tokens_per_image = audio_soft_tokens_per_image + self.audio_soft_tokens_per_audio = audio_soft_tokens_per_audio self.vision_soft_tokens_per_image = vision_soft_tokens_per_image self.boi_token_id = boi_token_id self.eoi_token_id = eoi_token_id diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 0817e16451a..7623e557980 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -937,7 +937,7 @@ class Gemma3nAudioEncoder(PreTrainedModel): Returns: audio_encodings: a torch.Tensor of shape - `[batch_size, self.config.audio_soft_tokens_per_image, + `[batch_size, self.config.audio_soft_tokens_per_audio, self.config.audio_config.hidden_size]` audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames]. """ @@ -2114,7 +2114,7 @@ class Gemma3nModel(Gemma3nPreTrainedModel): audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features) audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape - extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len + extra_padding_tokens = self.config.audio_soft_tokens_per_audio - audio_seq_len extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim) audio_features = torch.cat((audio_features, extra_padding_features), dim=1) diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index a3ffa710d84..e038af27b17 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -538,7 +538,7 @@ class Gemma3nConfig(PretrainedConfig): Custom vision config or dict. audio_config (`Union[AutoConfig, dict]`, *optional*): Custom audio config or dict. - audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): + audio_soft_tokens_per_audio (`int`, *optional*, defaults to 188): The number of soft tokens per audio clip. vision_soft_tokens_per_image (`int`, *optional*, defaults to 256): The number of soft tokens per image. @@ -594,7 +594,7 @@ class Gemma3nConfig(PretrainedConfig): text_config: Optional[Union[Gemma3nTextConfig, dict[str, Any]]] = None, vision_config: Optional[Union[Gemma3nVisionConfig, dict[str, Any]]] = None, audio_config: Optional[Union[Gemma3nAudioConfig, dict[str, Any]]] = None, - audio_soft_tokens_per_image: int = 188, + audio_soft_tokens_per_audio: int = 188, vision_soft_tokens_per_image: int = 256, boi_token_id: int = 255_999, eoi_token_id: int = 262_144, @@ -629,7 +629,7 @@ class Gemma3nConfig(PretrainedConfig): self.vision_config = vision_config self.audio_config = audio_config - self.audio_soft_tokens_per_image = audio_soft_tokens_per_image + self.audio_soft_tokens_per_audio = audio_soft_tokens_per_audio self.vision_soft_tokens_per_image = vision_soft_tokens_per_image self.boi_token_id = boi_token_id self.eoi_token_id = eoi_token_id @@ -1499,7 +1499,7 @@ class Gemma3nAudioEncoder(PreTrainedModel): Returns: audio_encodings: a torch.Tensor of shape - `[batch_size, self.config.audio_soft_tokens_per_image, + `[batch_size, self.config.audio_soft_tokens_per_audio, self.config.audio_config.hidden_size]` audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames]. """ @@ -2383,7 +2383,7 @@ class Gemma3nModel(PaliGemmaModel): audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features) audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape - extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len + extra_padding_tokens = self.config.audio_soft_tokens_per_audio - audio_seq_len extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim) audio_features = torch.cat((audio_features, extra_padding_features), dim=1)