Never fallback to eager implicitly (#38327)

* remove arg everywhere

* Update warnings

* add more models

* Update sdpa_attention.py

* fix style

* fix

* re-add warnings but not for flex

* Update test_modeling_common.py

* skip

* fix

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Cyril Vallez 2025-05-23 19:48:01 +02:00 committed by GitHub
parent e64ed0304c
commit e0aad278fe
73 changed files with 66 additions and 544 deletions
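
The user-facing effect of this change: SDPA and flex attention no longer fall back to eager silently, so attention weights must be requested by loading the model with eager attention explicitly. A minimal sketch of that usage (the checkpoint id is only illustrative; any causal LM on the Hub behaves the same):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"  # illustrative checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_id)
# After this change, `output_attentions=True` with SDPA/flex no longer triggers an
# implicit eager fallback; request eager attention explicitly at load time instead.
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager")

inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)
print(outputs.attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)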

View File

@ -243,13 +243,7 @@ class Olmo2Attention(OlmoAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
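
The same replacement is applied to every attention module touched in this commit; the new dispatch reduces to the shape below. This is a sketch only: `eager_attention_forward` is defined per modeling file, and the Llama import is just one concrete place it lives.

from typing import Callable

from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from transformers.models.llama.modeling_llama import eager_attention_forward


def select_attention_interface(config) -> Callable:
    # Start from the eager reference implementation and swap in the configured
    # backend unconditionally: there is no implicit eager fallback anymore when
    # `output_attentions=True` is requested together with SDPA.
    attention_interface: Callable = eager_attention_forward
    if config._attn_implementation != "eager":
        attention_interface = ALL_ATTENTION_FUNCTIONS[config._attn_implementation]
    return attention_interface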

View File

@ -655,7 +655,6 @@ class GenerationMixin(ContinuousMixin):
# If it's not defined, it means the model uses the new general mask API
if causal_mask_creation_function is None: # can't be found
output_attentions = kwargs.get("output_attentions", False)
token_type_ids = getattr(model_input, "token_type_ids", None)
# Some models may overwrite the general one
causal_mask_creation_function = getattr(self, "create_masks_for_generate", create_masks_for_generate)
@ -666,7 +665,6 @@ class GenerationMixin(ContinuousMixin):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
token_type_ids=token_type_ids,
)
else:

View File

@ -235,10 +235,9 @@ def flex_attention_forward(
head_mask: Optional[torch.Tensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
if kwargs.get("output_attentions", False) or head_mask is not None:
if head_mask is not None:
logger.warning_once(
"`flex_attention` does not support `output_attentions=True` or `head_mask`."
" Please set your attention to `eager` if you want any of these features."
"`flex_attention` does not support `head_mask`. Please set your attention to `eager` if you want this feature."
)
if kwargs.get("dropout", 0.0) > 0:

View File

@ -644,30 +644,12 @@ def _preprocess_mask_arguments(
return False, attention_mask, kv_length, kv_offset
def _get_mask_interface(config: PretrainedConfig, output_attentions: bool = False) -> Callable:
"""
Return the mask interface (a function) to be used, based on the type of attention found in the config.
Args:
config (`PretrainedConfig`):
The model config.
output_attentions (`bool`, optional):
Whether we return the attention scores or not. By default `False`.
"""
mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
# Sdpa fallbacks to eager in the Attention modules if `output_attentions=True`
if config._attn_implementation == "sdpa" and output_attentions:
mask_interface = ALL_MASK_ATTENTION_FUNCTIONS["eager"]
return mask_interface
def create_causal_mask(
config: PretrainedConfig,
input_embeds: torch.Tensor,
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
output_attentions: bool = False,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, "BlockMask"]]:
@ -689,8 +671,6 @@ def create_causal_mask(
A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
past_key_values (`Cache`, optional):
The past key values, if we use a cache.
output_attentions (`bool`, optional):
Whether we return the attention scores or not. By default `False`.
or_mask_function (`Callable`, optional):
An optional mask function to combine with the causal mask function (by doing the union of both). This is
useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
@ -712,7 +692,7 @@ def create_causal_mask(
batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
mask_factory_function = causal_mask_function
mask_interface = _get_mask_interface(config, output_attentions)
mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
# Do not allow skip if we are compiling (this is to match BC)
# TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
@ -751,7 +731,6 @@ def create_sliding_window_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
output_attentions: bool = False,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, "BlockMask"]]:
@ -774,8 +753,6 @@ def create_sliding_window_causal_mask(
A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
past_key_values (`Cache`, optional):
The past key values, if we use a cache.
output_attentions (`bool`, optional):
Whether we return the attention scores or not. By default `False`.
or_mask_function (`Callable`, optional):
An optional mask function to combine with the sliding causal mask function (by doing the union of both). This is
useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
@ -801,7 +778,7 @@ def create_sliding_window_causal_mask(
batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
mask_factory_function = sliding_window_causal_mask_function(sliding_window)
mask_interface = _get_mask_interface(config, output_attentions)
mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
# Do not allow skip if we are compiling (this is to match BC)
# TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
@ -841,7 +818,6 @@ def create_chunked_causal_mask(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
output_attentions: bool = False,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, "BlockMask"]]:
@ -864,8 +840,6 @@ def create_chunked_causal_mask(
A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
past_key_values (`Cache`, optional):
The past key values, if we use a cache.
output_attentions (`bool`, optional):
Whether we return the attention scores or not. By default `False`.
or_mask_function (`Callable`, optional):
An optional mask function to combine with the chunked causal mask function (by doing the union of both). This is
useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling.
@ -898,7 +872,7 @@ def create_chunked_causal_mask(
batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
mask_factory_function = chunked_causal_mask_function(chunk_size)
mask_interface = _get_mask_interface(config, output_attentions)
mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
# Do not allow skip if we are compiling (this is to match BC)
# TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
@ -945,7 +919,6 @@ def create_masks_for_generate(
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
output_attentions: bool = False,
or_mask_function: Optional[Callable] = None,
and_mask_function: Optional[Callable] = None,
**kwargs,
@ -967,8 +940,6 @@ def create_masks_for_generate(
A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
past_key_values (`Cache`, optional):
The past key values, if we use a cache.
output_attentions (`bool`, optional):
Whether we return the attention scores or not. By default `False`.
or_mask_function (`Callable`, optional):
An optional mask function to combine with the other mask function (by doing the union of both). This is
useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
@ -985,7 +956,6 @@ def create_masks_for_generate(
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
"or_mask_function": or_mask_function,
"and_mask_function": and_mask_function,
}
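
With `_get_mask_interface` removed, the mask-creation helpers drop the `output_attentions` argument and resolve the interface directly from the config. A hedged sketch of the updated call, matching the signature in the hunks above (the wrapper function is only for illustration):

from transformers.masking_utils import create_causal_mask


def build_causal_mask(config, inputs_embeds, attention_mask, cache_position, past_key_values):
    # `output_attentions` is no longer part of the signature; internally the mask
    # interface is simply ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation].
    return create_causal_mask(
        config=config,
        input_embeds=inputs_embeds,
        attention_mask=attention_mask,
        cache_position=cache_position,
        past_key_values=past_key_values,
    )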

View File

@ -805,7 +805,6 @@ class AriaTextModel(AriaTextPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -205,13 +205,7 @@ class BitNetAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -425,7 +419,6 @@ class BitNetModel(BitNetPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -85,13 +85,7 @@ class BitNetAttention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -261,13 +261,7 @@ class CohereAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -462,7 +456,6 @@ class CohereModel(CoherePreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -184,13 +184,7 @@ class CohereAttention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -222,13 +222,7 @@ class Cohere2Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -439,7 +433,6 @@ class Cohere2Model(Cohere2PreTrainedModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {

View File

@ -309,13 +309,7 @@ class Cohere2Attention(CohereAttention, nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -461,7 +455,6 @@ class Cohere2Model(Gemma2Model):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {

View File

@ -509,7 +509,6 @@ class CsmDepthDecoderModel(CsmPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds
@ -821,7 +820,6 @@ class CsmBackboneModel(CsmPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -247,7 +247,6 @@ class CsmDepthDecoderModel(LlamaModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -412,13 +412,7 @@ class DeepseekV3Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -608,7 +602,6 @@ class DeepseekV3Model(DeepseekV3PreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -293,13 +293,7 @@ class DeepseekV3Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -397,23 +397,6 @@ class DiffLlamaSdpaAttention(DiffLlamaAttention):
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"DiffLlamaModel is using DiffLlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
@ -708,7 +691,6 @@ class DiffLlamaModel(DiffLlamaPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -330,23 +330,6 @@ class DiffLlamaSdpaAttention(DiffLlamaAttention):
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"DiffLlamaModel is using DiffLlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)

View File

@ -1272,7 +1272,6 @@ class Emu3TextModel(Emu3PreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -383,13 +383,7 @@ class FalconH1Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -1552,7 +1546,6 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that

View File

@ -251,13 +251,7 @@ class FalconH1Attention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -1307,7 +1301,6 @@ class FalconH1ForCausalLM(LlamaForCausalLM):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that

View File

@ -415,7 +415,6 @@ class GemmaModel(GemmaPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
# embed positions

View File

@ -416,7 +416,6 @@ class GemmaModel(LlamaModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
# embed positions

View File

@ -218,13 +218,7 @@ class Gemma2Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -445,7 +439,6 @@ class Gemma2Model(Gemma2PreTrainedModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {

View File

@ -283,13 +283,7 @@ class Gemma2Attention(GemmaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -428,7 +422,6 @@ class Gemma2Model(GemmaModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {

View File

@ -345,14 +345,7 @@ class Gemma3Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
"Falling back to eager attention. This warning can be removed using the argument "
'`attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -566,7 +559,6 @@ class Gemma3TextModel(Gemma3PreTrainedModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {
@ -949,7 +941,6 @@ class Gemma3Model(Gemma3PreTrainedModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
if token_type_ids is not None and inputs_embeds.shape[1] != 1:
# We need to pass an additional mask function to account for token type ids, and it needs to be an `or`
@ -1200,7 +1191,6 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
output_attentions: bool = False,
token_type_ids: Optional[torch.Tensor] = None,
**kwargs,
) -> dict:
@ -1211,7 +1201,6 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Add the token type ids mask for generate as well
if token_type_ids is not None and input_embeds.shape[1] != 1:

View File

@ -424,14 +424,7 @@ class Gemma3Attention(Gemma2Attention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
"Falling back to eager attention. This warning can be removed using the argument "
'`attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -617,7 +610,6 @@ class Gemma3TextModel(Gemma2Model):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {
@ -840,7 +832,6 @@ class Gemma3Model(PaliGemmaModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
if token_type_ids is not None and inputs_embeds.shape[1] != 1:
# We need to pass an additional mask function to account for token type ids, and it needs to be an `or`
@ -1050,7 +1041,6 @@ class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):
attention_mask: Optional[torch.Tensor],
cache_position: torch.Tensor,
past_key_values: Optional[Cache],
output_attentions: bool = False,
token_type_ids: Optional[torch.Tensor] = None,
**kwargs,
) -> dict:
@ -1061,7 +1051,6 @@ class Gemma3ForConditionalGeneration(PaliGemmaForConditionalGeneration):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Add the token type ids mask for generate as well
if token_type_ids is not None and input_embeds.shape[1] != 1:

View File

@ -436,7 +436,6 @@ class GlmModel(GlmPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -444,7 +444,6 @@ class Glm4Model(Glm4PreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -166,28 +166,9 @@ class GPTNeoXAttention(nn.Module):
}
key_states, value_states = layer_past.update(key_states, value_states, self.layer_idx, cache_kwargs)
# Checking for fallbacks in case an unsupported feature is requested
attention_type = self.config._attn_implementation
if (output_attentions or head_mask is not None) and self.config._attn_implementation in [
"sdpa",
"flash_attention_2",
]:
logger.warning_once(
f"Setting `attention_type` to `eager` because `{attention_type}` does not support"
f" `output_attentions=True` or `head_mask`."
)
attention_type = "eager"
elif self.training and self.attention_dropout > 0 and self.config._attn_implementation == "flex_attention":
logger.warning_once(
f"Setting `attention_type` to `eager` because `dropout` is not supported in `{attention_type}`."
)
attention_type = "eager"
attention_interface: Callable = eager_attention_forward
attention_interface = (
ALL_ATTENTION_FUNCTIONS[attention_type] if attention_type != "eager" else attention_interface
)
if self.config._attn_implementation != "eager":
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
# Compute attention
attn_output, attn_weights = attention_interface(
@ -409,7 +390,6 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
# Prepare head mask if needed

View File

@ -153,28 +153,9 @@ class GPTNeoXAttention(nn.Module):
}
key_states, value_states = layer_past.update(key_states, value_states, self.layer_idx, cache_kwargs)
# Checking for fallbacks in case an unsupported feature is requested
attention_type = self.config._attn_implementation
if (output_attentions or head_mask is not None) and self.config._attn_implementation in [
"sdpa",
"flash_attention_2",
]:
logger.warning_once(
f"Setting `attention_type` to `eager` because `{attention_type}` does not support"
f" `output_attentions=True` or `head_mask`."
)
attention_type = "eager"
elif self.training and self.attention_dropout > 0 and self.config._attn_implementation == "flex_attention":
logger.warning_once(
f"Setting `attention_type` to `eager` because `dropout` is not supported in `{attention_type}`."
)
attention_type = "eager"
attention_interface: Callable = eager_attention_forward
attention_interface = (
ALL_ATTENTION_FUNCTIONS[attention_type] if attention_type != "eager" else attention_interface
)
if self.config._attn_implementation != "eager":
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
# Compute attention
attn_output, attn_weights = attention_interface(
@ -356,7 +337,6 @@ class GPTNeoXModel(LlamaModel, nn.Module):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
# Prepare head mask if needed

View File

@ -439,7 +439,6 @@ class GraniteModel(GranitePreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -181,7 +181,6 @@ class GraniteModel(LlamaModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -452,13 +452,7 @@ class GraniteMoeAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -203,13 +203,7 @@ class GraniteMoeHybridAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -387,13 +387,7 @@ class GraniteMoeSharedAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -421,7 +421,6 @@ class HeliumModel(HeliumPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -40,16 +40,12 @@ from ...utils import (
auto_docstring,
can_return_tuple,
is_torchdynamo_compiling,
logging,
torch_int,
)
from ..auto import AutoModel
from .configuration_internvl import InternVLConfig, InternVLVisionConfig
logger = logging.get_logger(__name__)
@use_kernel_forward_from_hub("RMSNorm")
class InternVLVisionRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
@ -151,13 +147,7 @@ class InternVLVisionAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -108,13 +108,7 @@ class InternVLVisionAttention(JanusVisionAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -358,13 +358,7 @@ class JanusVisionAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -506,13 +506,7 @@ class JanusVisionAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -425,7 +425,6 @@ class LlamaModel(LlamaPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -356,13 +356,7 @@ class Llama4TextAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
@ -570,7 +564,6 @@ class Llama4TextModel(Llama4PreTrainedModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {
@ -916,13 +909,7 @@ class Llama4VisionAttention(nn.Module):
attention_interface: Callable = vision_eager_attention_forward
# flex disable because breaks on TP 8, embed is 88 not power of 2
if self.config._attn_implementation not in ["eager", "flex_attention"]:
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -164,13 +164,7 @@ class MistralAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -404,7 +398,6 @@ class MistralModel(MistralPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -75,13 +75,7 @@ class MistralAttention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -168,7 +162,6 @@ class MistralModel(LlamaModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -276,13 +276,7 @@ class MixtralAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -525,7 +519,6 @@ class MixtralModel(MixtralPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -370,7 +370,6 @@ class MixtralModel(MistralModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -28,13 +28,10 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
from ...utils import auto_docstring, can_return_tuple, torch_int
from .configuration_mlcd import MLCDVisionConfig
logger = logging.get_logger(__name__)
class MLCDMLP(nn.Module):
def __init__(self, config):
super().__init__()
@ -281,13 +278,7 @@ class MLCDAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -226,13 +226,7 @@ class MLCDAttention(CLIPAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -265,13 +265,7 @@ class MoonshineAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False
@ -749,7 +743,6 @@ class MoonshineDecoder(MoonshinePreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -361,13 +361,7 @@ class MoonshineAttention(GlmAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False
@ -755,7 +749,6 @@ class MoonshineDecoder(LlamaModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -191,13 +191,7 @@ class OlmoAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -406,7 +400,6 @@ class OlmoModel(OlmoPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -111,13 +111,7 @@ class OlmoAttention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -179,13 +179,7 @@ class Olmo2Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -412,7 +406,6 @@ class Olmo2Model(Olmo2PreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -225,13 +225,7 @@ class Olmo2Attention(OlmoAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -173,13 +173,7 @@ class PhiAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -400,7 +394,6 @@ class PhiModel(PhiPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
inputs_embeds = self.embed_dropout(inputs_embeds) # diff with Llama

View File

@ -96,13 +96,7 @@ class PhiAttention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -251,7 +245,6 @@ class PhiModel(LlamaModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
inputs_embeds = self.embed_dropout(inputs_embeds) # diff with Llama

View File

@ -193,13 +193,7 @@ class Phi3Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -459,7 +453,6 @@ class Phi3Model(Phi3PreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -145,13 +145,7 @@ class Phi3Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,

View File

@ -1427,13 +1427,7 @@ class Phi4MultimodalAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -1766,7 +1760,6 @@ class Phi4MultimodalModel(Phi4MultimodalPreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -1577,7 +1577,6 @@ class Phi4MultimodalModel(Phi3Model, nn.Module):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds

View File

@ -165,13 +165,7 @@ class Qwen2Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -411,7 +405,6 @@ class Qwen2Model(Qwen2PreTrainedModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {
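
`output_attentions` also disappears from the mask kwargs in the model forward passes, since building a causal mask depends only on shapes and positions, never on whether attention weights will be returned. A toy stand-in for the library's mask helpers (assumed names and shapes, not the actual implementation) to make that concrete:

```python
import torch

def toy_causal_mask(attention_mask: torch.Tensor) -> torch.Tensor:
    # attention_mask: [batch, seq_len] padding mask of 0/1 values.
    seq_len = attention_mask.shape[1]
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    padding = attention_mask.bool()[:, None, None, :]  # [batch, 1, 1, seq_len]
    return causal[None, None, :, :] & padding          # [batch, 1, seq_len, seq_len]

mask = toy_causal_mask(torch.tensor([[1, 1, 1, 0, 0]]))
print(mask.shape)  # torch.Size([1, 1, 5, 5])
```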


@ -75,13 +75,7 @@ class Qwen2Attention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -173,7 +167,6 @@ class Qwen2Model(MistralModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {


@ -212,13 +212,7 @@ class Qwen3Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -437,7 +431,6 @@ class Qwen3Model(Qwen3PreTrainedModel):
"attention_mask": attention_mask,
"cache_position": cache_position,
"past_key_values": past_key_values,
"output_attentions": output_attentions,
}
# Create the masks
causal_mask_mapping = {


@ -89,13 +89,7 @@ class Qwen3Attention(LlamaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,


@ -177,13 +177,7 @@ class Qwen3MoeAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -530,7 +524,6 @@ class Qwen3MoeModel(Qwen3MoePreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds


@ -184,13 +184,7 @@ class Starcoder2Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -405,7 +399,6 @@ class Starcoder2Model(Starcoder2PreTrainedModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds


@ -103,13 +103,7 @@ class Starcoder2Attention(MistralAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
@ -220,7 +214,6 @@ class Starcoder2Model(MistralModel):
attention_mask=attention_mask,
cache_position=cache_position,
past_key_values=past_key_values,
output_attentions=output_attentions,
)
hidden_states = inputs_embeds


@ -246,13 +246,7 @@ class TimesFmAttention(nn.Module):
attention_interface: Callable = simple_eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,


@ -202,13 +202,7 @@ class TimesFmAttention(nn.Module):
attention_interface: Callable = simple_eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,


@ -269,13 +269,7 @@ class ZambaAttention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,


@ -53,7 +53,6 @@ if is_causal_conv1d_available():
else:
causal_conv1d_update, causal_conv1d_fn = None, None
logger = logging.get_logger(__name__)
@ -435,13 +434,7 @@ class Zamba2Attention(nn.Module):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,


@ -270,13 +270,7 @@ class Zamba2Attention(ZambaAttention):
attention_interface: Callable = eager_attention_forward
if self.config._attn_implementation != "eager":
if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
logger.warning_once(
"`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
else:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,


@ -4353,7 +4353,8 @@ class ModelTesterMixin:
if hasattr(config, "layer_types"):
del config_dict["layer_types"]
new_config = config.__class__(**config_dict)
model = model_class(new_config).to(torch_device)
# We need to set eager as otherwise `output_attentions` is not supported
model = model_class._from_config(new_config, attn_implementation="eager").to(torch_device)
model.eval()
layer_types = getattr(model.config, "layer_types", ["sliding_attention"] * config.num_hidden_layers)
attentions = model(**inputs, output_attentions=True).attentions
@ -4370,7 +4371,8 @@ class ModelTesterMixin:
if hasattr(config, "layer_types"):
del config_dict["layer_types"]
new_config = config.__class__(**config_dict)
model = model_class(new_config).to(torch_device)
# We need to set eager as otherwise `output_attentions` is not supported
model = model_class._from_config(new_config, attn_implementation="eager").to(torch_device)
model.eval()
attentions_not_sliding = model(**inputs, output_attentions=True).attentions
for layer_attention in attentions_not_sliding:
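
The test fix above is also the user-facing takeaway: requesting attention weights no longer silently switches an SDPA model to eager, so eager has to be chosen explicitly at load time. A typical usage sketch (the checkpoint name is only an example):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2-0.5B"  # example checkpoint; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Opt into eager attention explicitly when you need the attention weights.
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager")

inputs = tokenizer("Hello there", return_tensors="pt")
outputs = model(**inputs, output_attentions=True)
print(len(outputs.attentions), outputs.attentions[0].shape)
```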