Don't use default attn if pre-set in sub-config (#38526)

* don't use default attn if pre-set in sub-config

* style

* add a test maybe
Raushan Turganbay 2025-06-03 09:53:07 +02:00 committed by GitHub
parent bf68dd9e6e
commit 55ec319de6
2 changed files with 14 additions and 2 deletions
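
For illustration, here is a minimal, self-contained sketch of the rule this commit introduces. It is not the actual transformers code; the class and function names below are invented for the example. The idea: a sub-config that already has an attention implementation keeps it, and the requested implementation is only filled in when nothing was pre-set and the request is not the default `None`.

# Simplified mock of the dispatch rule (not the real transformers implementation).
from dataclasses import dataclass
from typing import Optional


@dataclass
class SubConfig:
    # mirrors `PretrainedConfig._attn_implementation_internal`
    _attn_implementation_internal: Optional[str] = None


def dispatch_attn(sub_config: Optional[SubConfig], curr_attn_implementation: Optional[str]) -> None:
    # Only fill in the sub-config when it exists, has no pre-set attention,
    # and the requested implementation is not the default `None`.
    if (
        sub_config is not None
        and sub_config._attn_implementation_internal is None
        and curr_attn_implementation is not None
    ):
        sub_config._attn_implementation_internal = curr_attn_implementation


text = SubConfig(_attn_implementation_internal="eager")  # pre-set by the user
vision = SubConfig()                                      # nothing pre-set

dispatch_attn(text, "sdpa")
dispatch_attn(vision, "sdpa")

assert text._attn_implementation_internal == "eager"   # pre-set value wins
assert vision._attn_implementation_internal == "sdpa"  # falls back to the requested value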


@@ -2277,8 +2277,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
             if not isinstance(requested_attn_implementation, dict)
             else requested_attn_implementation.get(key, None)
         )
-        # For models with backbone sub-config might be not initialized
-        if sub_config is not None:
+        # For models with a backbone, the sub-config might not be initialized. Only set the requested attn
+        # if the sub-config has no attn pre-set and the requested attn is not `None` (i.e. not the default attn).
+        if (
+            sub_config is not None
+            and sub_config._attn_implementation_internal is None
+            and curr_attn_implementation is not None
+        ):
             sub_config._attn_implementation_internal = curr_attn_implementation
     if config._attn_implementation == "flash_attention_2":
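
A usage sketch of what this change enables, assuming a composite vision-text checkpoint whose config exposes a text sub-config. The checkpoint id below is only an illustrative tiny test repo, and building the model directly from the config (random weights) keeps the example cheap; both are assumptions, not taken from this diff.

# Hedged usage sketch: pre-set attention on a sub-config is preserved.
from transformers import AutoConfig, LlavaForConditionalGeneration

config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-LlavaForConditionalGeneration")

# Pre-set the text sub-config only; the top-level request stays at the default `None`.
config.get_text_config(decoder=True)._attn_implementation = "eager"

model = LlavaForConditionalGeneration(config)

# With this commit, the default/requested attention no longer overwrites the pre-set value.
assert model.config.get_text_config(decoder=True)._attn_implementation == "eager"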


@@ -3425,6 +3425,13 @@ class ModelTesterMixin:
f"The eager model should not have SDPA/FA2 attention layers but got `{class_name}.config._attn_implementation={submodule.config._attn_implementation}`"
)
# Set the attention to default `None` but the text config to `eager`
# The model should load encoders in SDPA but not the text attention
config._attn_implementation = None
config.get_text_config(decoder=True)._attn_implementation = "eager"
model = model_class(config)
self.assertTrue(model.config.get_text_config(decoder=True)._attn_implementation == "eager")
@require_torch_sdpa
def test_sdpa_can_dispatch_non_composite_models(self):
"""