diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py
index 1af14d021c7..8e7c44fcad5 100644
--- a/src/transformers/models/cohere2/modeling_cohere2.py
+++ b/src/transformers/models/cohere2/modeling_cohere2.py
@@ -660,7 +660,7 @@ class Cohere2Model(Cohere2PreTrainedModel):
         input_tensor: torch.Tensor,
         cache_position: torch.Tensor,
         past_key_values: HybridCache,
-        output_attentions: bool,
+        output_attentions: bool = False,
     ):
         # Flash Attention currently doesn't support static cache but Cohere2 work only with static cache.
         # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index 353b171042f..fd63ec26c1c 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -673,7 +673,7 @@ class Gemma2Model(Gemma2PreTrainedModel):
         input_tensor: torch.Tensor,
         cache_position: torch.Tensor,
         past_key_values: HybridCache,
-        output_attentions: bool,
+        output_attentions: bool = False,
     ):
         # Flash Attention currently doesn't support static cache but Gemma2 work only with static cache.
         # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape
diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py
index b219384f34a..3d16f842ec6 100644
--- a/src/transformers/models/gemma2/modular_gemma2.py
+++ b/src/transformers/models/gemma2/modular_gemma2.py
@@ -540,7 +540,7 @@ class Gemma2Model(GemmaModel):
         input_tensor: torch.Tensor,
         cache_position: torch.Tensor,
         past_key_values: HybridCache,
-        output_attentions: bool,
+        output_attentions: bool = False,
     ):
         # Flash Attention currently doesn't support static cache but Gemma2 work only with static cache.
         # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape
diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py
index 316130ce9d1..951e8d78ca9 100644
--- a/src/transformers/models/gemma3/modeling_gemma3.py
+++ b/src/transformers/models/gemma3/modeling_gemma3.py
@@ -758,7 +758,7 @@ class Gemma3TextModel(Gemma3PreTrainedModel):
         input_tensor: torch.Tensor,
         cache_position: torch.Tensor,
         past_key_values: HybridCache,
-        output_attentions: bool,
+        output_attentions: bool = False,
     ):
         # Flash Attention currently doesn't support static cache but Gemma3Text work only with static cache.
         # So we will pass in attention mask as is in any case, not only when ther's padding. Then we'll use its shape
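For readers skimming the patch: every hunk makes the same change, giving the `output_attentions` parameter of each model's `_update_causal_mask` helper a default of `False`, so callers that omit the argument no longer fail. A minimal sketch of that before/after calling behaviour follows; the functions below are hypothetical stand-ins for illustration, not the actual transformers code.

# Hypothetical stand-ins illustrating the signature change; not the real
# transformers helper, only the calling behaviour the diff affects.

def update_causal_mask_before(attention_mask, input_tensor, cache_position,
                              past_key_values, output_attentions):
    # Before the patch: output_attentions is required, so a call that
    # omits it raises TypeError.
    return attention_mask


def update_causal_mask_after(attention_mask, input_tensor, cache_position,
                             past_key_values, output_attentions=False):
    # After the patch: output_attentions defaults to False (the common case),
    # so call sites that omit it keep working.
    return attention_mask


update_causal_mask_after(None, None, None, None)       # fine: defaults to False
try:
    update_causal_mask_before(None, None, None, None)  # TypeError: missing argument
except TypeError as exc:
    print(exc)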