From f5d45d89c4f1d3282fc216a96dc6830b88d41249 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Fri, 23 May 2025 17:17:38 +0200
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A8Early-error=F0=9F=9A=A8=20config=20?=
 =?UTF-8?q?will=20error=20out=20if=20`output=5Fattentions=3DTrue`=20and=20?=
 =?UTF-8?q?the=20attn=20implementation=20is=20wrong=20(#38288)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Protect ParallelInterface

* early error out on output attention setting for no warning in modeling

* modular update

* fixup

* update model tests

* update

* oups

* set model's config

* more cases

* ??

* properly fix

* fixup

* update

* last ones

* update

* fix?

* fix wrong merge commit

* fix hub test

* nits

* wow I am tired

* updates

* fix pipeline!

---------

Co-authored-by: Lysandre
---
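Note: the user-visible change is that `PretrainedConfig` now rejects `output_attentions=True` whenever the attention implementation is anything but `"eager"`, instead of warning and silently falling back at forward time. A minimal sketch of the new contract — the checkpoint name below is only a placeholder; any model that supports both SDPA and eager attention behaves the same way:

    from transformers import AutoModelForCausalLM

    # Loading with a non-eager implementation such as SDPA...
    model = AutoModelForCausalLM.from_pretrained("org/model", attn_implementation="sdpa")

    try:
        model.config.output_attentions = True  # now raises immediately
    except ValueError as err:
        print(err)  # asks you to load with attn_implementation="eager" instead

    # ...whereas the supported path is to request eager attention up front:
    model = AutoModelForCausalLM.from_pretrained("org/model", attn_implementation="eager")
    model.config.output_attentions = True  # fine: eager attention can return weights

Previously the same request only triggered a `logger.warning_once` inside every attention layer and silently fell back to eager attention.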
 docs/source/en/model_doc/jamba.md             |  2 +-
 src/transformers/configuration_utils.py       | 20 ++++++++++++++++++-
 src/transformers/models/aria/modeling_aria.py |  9 +--------
 .../models/bamba/modeling_bamba.py            |  9 +--------
 src/transformers/models/csm/modeling_csm.py   |  9 +--------
 src/transformers/models/emu3/modeling_emu3.py |  9 +--------
 .../models/gemma/modeling_gemma.py            |  9 +--------
 src/transformers/models/glm/modeling_glm.py   |  9 +--------
 src/transformers/models/glm4/modeling_glm4.py |  9 +--------
 .../models/granite/modeling_granite.py        |  9 +--------
 .../models/helium/modeling_helium.py          |  9 +--------
 .../models/llama/modeling_llama.py            |  9 +--------
 tests/generation/test_utils.py                |  7 +++++--
 .../autoformer/test_modeling_autoformer.py    |  3 ++-
 tests/models/bamba/test_modeling_bamba.py     |  3 ++-
 tests/models/canine/test_modeling_canine.py   |  3 ++-
 .../test_modeling_conditional_detr.py         |  3 ++-
 .../models/convbert/test_modeling_convbert.py |  3 ++-
 tests/models/d_fine/test_modeling_d_fine.py   |  3 ++-
 .../models/dab_detr/test_modeling_dab_detr.py |  3 ++-
 .../test_modeling_deformable_detr.py          |  3 ++-
 tests/models/detr/test_modeling_detr.py       |  3 ++-
 .../models/donut/test_modeling_donut_swin.py  |  3 ++-
 .../test_modeling_encoder_decoder.py          |  1 +
 .../falcon_h1/test_modeling_falcon_h1.py      |  3 ++-
 .../test_modeling_fastspeech2_conformer.py    |  6 ++++--
 tests/models/flava/test_modeling_flava.py     |  3 ++-
 tests/models/glpn/test_modeling_glpn.py       |  3 ++-
 .../test_modeling_grounding_dino.py           |  3 ++-
 .../models/groupvit/test_modeling_groupvit.py |  3 ++-
 tests/models/hiera/test_modeling_hiera.py     |  3 ++-
 tests/models/idefics/test_modeling_idefics.py |  3 ++-
 .../models/informer/test_modeling_informer.py |  3 ++-
 tests/models/jamba/test_modeling_jamba.py     |  3 ++-
 .../layoutlmv2/test_modeling_layoutlmv2.py    |  3 ++-
 tests/models/led/test_modeling_led.py         |  3 ++-
 tests/models/luke/test_modeling_luke.py       |  3 ++-
 .../maskformer/test_modeling_maskformer.py    |  3 ++-
 .../moonshine/test_modeling_moonshine.py      |  3 ++-
 .../omdet_turbo/test_modeling_omdet_turbo.py  |  3 ++-
 .../paligemma/test_modeling_paligemma.py      |  3 ++-
 .../pegasus_x/test_modeling_pegasus_x.py      |  3 ++-
 .../perceiver/test_modeling_perceiver.py      |  3 ++-
 tests/models/rt_detr/test_modeling_rt_detr.py |  3 ++-
 .../rt_detr_v2/test_modeling_rt_detr_v2.py    |  3 ++-
 tests/models/rwkv/test_modeling_rwkv.py       |  3 ++-
 tests/models/sam/test_modeling_sam.py         |  6 ++++--
 tests/models/sam/test_modeling_tf_sam.py      |  3 ++-
 tests/models/sam_hq/test_modeling_sam_hq.py   |  6 ++++--
 .../test_modeling_seamless_m4t.py             |  3 ++-
 .../test_modeling_seamless_m4t_v2.py          |  3 ++-
 .../segformer/test_modeling_segformer.py      |  3 ++-
 .../test_modeling_speech_to_text.py           |  3 ++-
 .../test_modeling_tf_speech_to_text.py        |  3 ++-
 .../models/speecht5/test_modeling_speecht5.py |  6 ++++--
 tests/models/swin/test_modeling_swin.py       |  3 ++-
 tests/models/swin2sr/test_modeling_swin2sr.py |  3 ++-
 tests/models/swinv2/test_modeling_swinv2.py   |  3 ++-
 .../test_modeling_table_transformer.py        |  3 ++-
 .../test_modeling_time_series_transformer.py  |  3 ++-
 tests/models/vilt/test_modeling_vilt.py       |  3 ++-
 .../visual_bert/test_modeling_visual_bert.py  |  3 ++-
 tests/models/vivit/test_modeling_vivit.py     |  3 ++-
 tests/models/whisper/test_modeling_whisper.py |  3 ++-
 tests/models/x_clip/test_modeling_x_clip.py   |  3 ++-
 tests/models/yolos/test_modeling_yolos.py     |  3 ++-
 tests/models/zamba/test_modeling_zamba.py     |  3 ++-
 tests/models/zamba2/test_modeling_zamba2.py   |  3 ++-
 .../pipelines/test_pipelines_text_to_audio.py |  1 +
 tests/test_modeling_common.py                 |  5 +++--
 tests/utils/test_configuration_utils.py       |  1 +
 71 files changed, 157 insertions(+), 144 deletions(-)

diff --git a/docs/source/en/model_doc/jamba.md b/docs/source/en/model_doc/jamba.md
index a096f238418..5dad796f260 100644
--- a/docs/source/en/model_doc/jamba.md
+++ b/docs/source/en/model_doc/jamba.md
@@ -99,7 +99,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True,
 device_map = {'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 2, 'model.layers.19': 2, 'model.layers.20': 2, 'model.layers.21': 2, 'model.layers.22': 2, 'model.layers.23': 2, 'model.layers.24': 2, 'model.layers.25': 2, 'model.layers.26': 2, 'model.layers.27': 3, 'model.layers.28': 3, 'model.layers.29': 3, 'model.layers.30': 3, 'model.layers.31': 3, 'model.layers.32': 3, 'model.layers.33': 3, 'model.layers.34': 3, 'model.layers.35': 3, 'model.layers.36': 4, 'model.layers.37': 4, 'model.layers.38': 4, 'model.layers.39': 4, 'model.layers.40': 4, 'model.layers.41': 4, 'model.layers.42': 4, 'model.layers.43': 4, 'model.layers.44': 4, 'model.layers.45': 5, 'model.layers.46': 5, 'model.layers.47': 5, 'model.layers.48': 5, 'model.layers.49': 5, 'model.layers.50': 5, 'model.layers.51': 5, 'model.layers.52': 5, 'model.layers.53': 5, 'model.layers.54': 6, 'model.layers.55': 6, 'model.layers.56': 6, 'model.layers.57': 6, 'model.layers.58': 6, 'model.layers.59': 6, 'model.layers.60': 6, 'model.layers.61': 6, 'model.layers.62': 6, 'model.layers.63': 7, 'model.layers.64': 7, 'model.layers.65': 7, 'model.layers.66': 7, 'model.layers.67': 7, 'model.layers.68': 7, 'model.layers.69': 7, 'model.layers.70': 7, 'model.layers.71': 7, 'model.final_layernorm': 7, 'lm_head': 7}
 model = AutoModelForCausalLM.from_pretrained("ai21labs/AI21-Jamba-Large-1.6",
                                              torch_dtype=torch.bfloat16,
-                                             attn_implementation="flash_attention_2", 
+                                             attn_implementation="flash_attention_2",
                                              quantization_config=quantization_config,
                                              device_map=device_map)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 6e75fbfb54a..28e3b41f7e1 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -214,7 +214,7 @@ class PretrainedConfig(PushToHubMixin):
         # Attributes with defaults
         self.return_dict = kwargs.pop("return_dict", True)
         self.output_hidden_states = kwargs.pop("output_hidden_states", False)
-        self.output_attentions = kwargs.pop("output_attentions", False)
+        self._output_attentions = kwargs.pop("output_attentions", False)
         self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
         self.torch_dtype = kwargs.pop("torch_dtype", None)  # Only used by PyTorch models
         self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
@@ -331,6 +331,22 @@ class PretrainedConfig(PushToHubMixin):
     def name_or_path(self, value):
         self._name_or_path = str(value)  # Make sure that name_or_path is a string (for JSON encoding)

+    @property
+    def output_attentions(self):
+        """
+        `bool`: Whether or not the model should return all attentions.
+        """
+        return self._output_attentions
+
+    @output_attentions.setter
+    def output_attentions(self, value):
+        if self._attn_implementation != "eager":
+            raise ValueError(
+                "The `output_attentions` attribute is not supported when using the `attn_implementation` set to "
+                f"{self._attn_implementation}. Please set it to 'eager' instead."
+            )
+        self._output_attentions = value
+
     @property
     def use_return_dict(self) -> bool:
         """
@@ -1004,6 +1020,8 @@ class PretrainedConfig(PushToHubMixin):

         if "_auto_class" in d:
             del d["_auto_class"]
+        if "_output_attentions" in d:
+            d["output_attentions"] = d.pop("_output_attentions")
         if "_commit_hash" in d:
             del d["_commit_hash"]
         if "_attn_implementation_internal" in d:
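To keep serialized configs stable, the value now lives in a private `_output_attentions` attribute behind an `output_attentions` property, and `to_dict()` maps it back to the public key, as the hunks above show. A small round-trip sketch (assuming a bare `PretrainedConfig`, which defaults to the eager implementation, so the setter does not raise):

    from transformers import PretrainedConfig

    config = PretrainedConfig(output_attentions=True)  # stored as `_output_attentions`
    d = config.to_dict()

    assert "output_attentions" in d       # public key restored for serialization
    assert "_output_attentions" not in d  # private storage does not leak
    assert config.output_attentions       # property reads the private attribute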
kwargs.pop("output_hidden_states", False) - self.output_attentions = kwargs.pop("output_attentions", False) + self._output_attentions = kwargs.pop("output_attentions", False) self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models self.torch_dtype = kwargs.pop("torch_dtype", None) # Only used by PyTorch models self.use_bfloat16 = kwargs.pop("use_bfloat16", False) @@ -331,6 +331,22 @@ class PretrainedConfig(PushToHubMixin): def name_or_path(self, value): self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding) + @property + def output_attentions(self): + """ + `bool`: Whether or not the model should returns all attentions. + """ + return self._output_attentions + + @output_attentions.setter + def output_attentions(self, value): + if self._attn_implementation != "eager": + raise ValueError( + "The `output_attentions` attribute is not supported when using the `attn_implementation` set to " + f"{self._attn_implementation}. Please set it to 'eager' instead." + ) + self._output_attentions = value + @property def use_return_dict(self) -> bool: """ @@ -1004,6 +1020,8 @@ class PretrainedConfig(PushToHubMixin): if "_auto_class" in d: del d["_auto_class"] + if "_output_attentions" in d: + d["output_attentions"] = d.pop("_output_attentions") if "_commit_hash" in d: del d["_commit_hash"] if "_attn_implementation_internal" in d: diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index cd794846275..8f80a7ff08b 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -549,15 +549,8 @@ class AriaTextAttention(nn.Module): key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 11742b1a321..918782826dd 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -313,15 +313,8 @@ class BambaAttention(nn.Module): key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py
index 6f8fd7a487f..34abf4f15d7 100644
--- a/src/transformers/models/csm/modeling_csm.py
+++ b/src/transformers/models/csm/modeling_csm.py
@@ -337,15 +337,8 @@ class CsmAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index 6e7b2e419a0..cf7004c97f7 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -206,15 +206,8 @@ class Emu3Attention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 897f329e56c..b7990e9660c 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -239,15 +239,8 @@ class GemmaAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py
index f3ac600e22b..236b0ed5c4c 100644
--- a/src/transformers/models/glm/modeling_glm.py
+++ b/src/transformers/models/glm/modeling_glm.py
@@ -201,15 +201,8 @@ class GlmAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py
index 4525ba15018..48d8522502e 100644
--- a/src/transformers/models/glm4/modeling_glm4.py
+++ b/src/transformers/models/glm4/modeling_glm4.py
@@ -259,15 +259,8 @@ class Glm4Attention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index fdba3f4c0eb..639db777012 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -165,15 +165,8 @@ class GraniteAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py
index 5d58ca59458..eea73341d05 100644
--- a/src/transformers/models/helium/modeling_helium.py
+++ b/src/transformers/models/helium/modeling_helium.py
@@ -241,15 +241,8 @@ class HeliumAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 1718c587d94..7b0416ec19f 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -245,15 +245,8 @@ class LlamaAttention(nn.Module):
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

         attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
-                logger.warning_once(
-                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
-                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-                )
-            else:
-                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
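With the config-level guard in place, the per-model SDPA fallback warning above becomes dead code, which is why each attention module collapses to a plain table lookup. The test suites that request attentions are updated accordingly (see the hunks below): they build models with eager attention explicitly. A runnable sketch of that pattern with arbitrary tiny sizes — the updated tests do the same thing through their own testers:

    from transformers import LlamaConfig, LlamaForCausalLM

    config = LlamaConfig(hidden_size=32, intermediate_size=64, num_hidden_layers=2,
                         num_attention_heads=4, num_key_value_heads=4, vocab_size=128)

    # Build the model with eager attention so that attentions may be returned...
    model = LlamaForCausalLM._from_config(config, attn_implementation="eager")
    config = model.config  # _from_config copies the config; re-read it from the model

    # ...then requesting attention weights stays legal:
    config.output_attentions = True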
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 57037ee435c..617ba23ebd6 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1157,7 +1157,8 @@ class GenerationTesterMixin:
             self.skipTest(reason=f"{model_class.__name__} doesn't support caching")

         config.is_decoder = True
-        model = model_class(config).to(torch_device).eval()
+        model = model_class._from_config(config, attn_implementation="eager").to(torch_device).eval()
+        config = model.config
         # Sets assisted generation arguments such that:
         # a) no EOS is generated, to ensure generation doesn't break early
         # b) the assistant model always generates two tokens when it is called, to ensure the input preparation of
@@ -1187,6 +1188,7 @@ class GenerationTesterMixin:
             assistant_model = model_class(config).to(torch_device).eval()
         else:
             assistant_model = model
+        assistant_model.config._attn_implementation = "eager"
         assistant_model.generation_config.num_assistant_tokens = 2  # see b)
         assistant_model.generation_config.num_assistant_tokens_schedule = "constant"  # see b)
         generation_kwargs.update({"assistant_model": assistant_model})
@@ -1367,7 +1369,8 @@ class GenerationTesterMixin:
             self.skipTest(reason=f"{model_class.__name__} doesn't support caching")

         config.is_decoder = True
-        model = model_class(config).to(torch_device).eval()
+        model = model_class._from_config(config, attn_implementation="eager").to(torch_device).eval()
+        config = model.config
         # Sets assisted generation arguments such that:
         # a) no EOS is generated, to ensure generation doesn't break early
         # b) the assistant model always generates two tokens when it is called, to ensure the input preparation of
diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py
index 3c1435f33ee..954f9f16622 100644
--- a/tests/models/autoformer/test_modeling_autoformer.py
+++ b/tests/models/autoformer/test_modeling_autoformer.py
@@ -323,7 +323,8 @@ class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py
index bacbfa0aead..7c00a7a030d 100644
--- a/tests/models/bamba/test_modeling_bamba.py
+++ b/tests/models/bamba/test_modeling_bamba.py
@@ -362,7 +362,8 @@ class BambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()

diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py
index 94becd11af6..4043a8ea908 100644
--- a/tests/models/canine/test_modeling_canine.py
+++ b/tests/models/canine/test_modeling_canine.py
@@ -318,7 +318,8 @@ class CanineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py
index c13079c5fd5..f752e58c6af 100644
--- a/tests/models/conditional_detr/test_modeling_conditional_detr.py
+++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py
@@ -279,7 +279,8 @@ class ConditionalDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py
index 8aba631c307..908c0920389 100644
--- a/tests/models/convbert/test_modeling_convbert.py
+++ b/tests/models/convbert/test_modeling_convbert.py
@@ -327,7 +327,8 @@ class ConvBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/d_fine/test_modeling_d_fine.py b/tests/models/d_fine/test_modeling_d_fine.py
index cbbe821c792..741fbd0ca54 100644
--- a/tests/models/d_fine/test_modeling_d_fine.py
+++ b/tests/models/d_fine/test_modeling_d_fine.py
@@ -371,7 +371,8 @@ class DFineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/dab_detr/test_modeling_dab_detr.py b/tests/models/dab_detr/test_modeling_dab_detr.py
index e040a2c6624..8b4d8c139dc 100644
--- a/tests/models/dab_detr/test_modeling_dab_detr.py
+++ b/tests/models/dab_detr/test_modeling_dab_detr.py
@@ -499,7 +499,8 @@ class DabDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py
index 6274a7e1efb..d069a711bf9 100644
--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -278,7 +278,8 @@ class DeformableDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py
index 6d31cdc65db..b626f74c5c5 100644
--- a/tests/models/detr/test_modeling_detr.py
+++ b/tests/models/detr/test_modeling_detr.py
@@ -279,7 +279,8 @@ class DetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py
index 078331389b8..456da850079 100644
--- a/tests/models/donut/test_modeling_donut_swin.py
+++ b/tests/models/donut/test_modeling_donut_swin.py
@@ -214,7 +214,8 @@ class DonutSwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py
index 9aceea0359b..e9eddf86567 100644
--- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py
+++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py
@@ -457,6 +457,7 @@ class EncoderDecoderMixin:
         decoder_attention_mask = decoder_attention_mask[:, :-1]
         encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
         enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
+        enc_dec_model.config._attn_implementation = "eager"  # model config -> won't work
         enc_dec_model.config.output_attentions = True  # model config -> won't work
         enc_dec_model.to(torch_device)
         outputs_encoder_decoder = enc_dec_model(
diff --git a/tests/models/falcon_h1/test_modeling_falcon_h1.py b/tests/models/falcon_h1/test_modeling_falcon_h1.py
index f627fa5f634..e8432c477ce 100644
--- a/tests/models/falcon_h1/test_modeling_falcon_h1.py
+++ b/tests/models/falcon_h1/test_modeling_falcon_h1.py
@@ -337,7 +337,8 @@ class FalconH1ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()

diff --git a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
index 0e0562da92d..22201f42b0e 100644
--- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
+++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py
@@ -287,7 +287,8 @@ class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
@@ -709,7 +710,8 @@ class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.model_config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py
index 322fa9d68f4..f5cafdc9578 100644
--- a/tests/models/flava/test_modeling_flava.py
+++ b/tests/models/flava/test_modeling_flava.py
@@ -218,7 +218,8 @@ class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py
index 94f357455b7..b3e1852373a 100644
--- a/tests/models/glpn/test_modeling_glpn.py
+++ b/tests/models/glpn/test_modeling_glpn.py
@@ -184,7 +184,8 @@ class GLPNModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py
index b55c590efb6..80023aa5b79 100644
--- a/tests/models/grounding_dino/test_modeling_grounding_dino.py
+++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py
@@ -327,7 +327,8 @@ class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Tes
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py
index 6226f6ff0f6..24e4328ac7e 100644
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -202,7 +202,8 @@ class GroupViTVisionModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/hiera/test_modeling_hiera.py b/tests/models/hiera/test_modeling_hiera.py
index 7874c78d829..dfbec4a4b8a 100644
--- a/tests/models/hiera/test_modeling_hiera.py
+++ b/tests/models/hiera/test_modeling_hiera.py
@@ -281,7 +281,8 @@ class HieraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py
index 5f6a0f1832c..c76ece0d959 100644
--- a/tests/models/idefics/test_modeling_idefics.py
+++ b/tests/models/idefics/test_modeling_idefics.py
@@ -502,7 +502,8 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 92fddb49989..22e6217c72c 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -384,7 +384,8 @@ class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py
index 67f9b30c855..993db32378b 100644
--- a/tests/models/jamba/test_modeling_jamba.py
+++ b/tests/models/jamba/test_modeling_jamba.py
@@ -452,7 +452,8 @@ class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()

diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index accff4075b7..00cf7e59b6e 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -334,7 +334,8 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py
index a80543ef258..60a10eb860b 100644
--- a/tests/models/led/test_modeling_led.py
+++ b/tests/models/led/test_modeling_led.py
@@ -405,7 +405,8 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py
index d255a607c57..dd51475540c 100644
--- a/tests/models/luke/test_modeling_luke.py
+++ b/tests/models/luke/test_modeling_luke.py
@@ -758,7 +758,8 @@ class LukeModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py
index faf074902dc..2f30d4dc3c6 100644
--- a/tests/models/maskformer/test_modeling_maskformer.py
+++ b/tests/models/maskformer/test_modeling_maskformer.py
@@ -299,7 +299,8 @@ class MaskFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/moonshine/test_modeling_moonshine.py b/tests/models/moonshine/test_modeling_moonshine.py
index abc45a491f0..6001f2058d9 100644
--- a/tests/models/moonshine/test_modeling_moonshine.py
+++ b/tests/models/moonshine/test_modeling_moonshine.py
@@ -167,7 +167,8 @@ class MoonshineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()

diff --git a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py
index 457572fbbea..291e2709bbb 100644
--- a/tests/models/omdet_turbo/test_modeling_omdet_turbo.py
+++ b/tests/models/omdet_turbo/test_modeling_omdet_turbo.py
@@ -462,7 +462,8 @@ class OmDetTurboModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py
index a4d323baa31..388661ac1fd 100644
--- a/tests/models/paligemma/test_modeling_paligemma.py
+++ b/tests/models/paligemma/test_modeling_paligemma.py
@@ -355,7 +355,8 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         for model_class in self.all_model_classes:
-            model = model_class(config)
+            model = model_class._from_config(config, attn_implementation="eager")
+            config = model.config
             model.to(torch_device)
             model.eval()

diff --git a/tests/models/pegasus_x/test_modeling_pegasus_x.py b/tests/models/pegasus_x/test_modeling_pegasus_x.py
index a6bf913e4c2..20cdac98fbe 100644
--- a/tests/models/pegasus_x/test_modeling_pegasus_x.py
+++ b/tests/models/pegasus_x/test_modeling_pegasus_x.py
@@ -304,7 +304,8 @@ class PegasusXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py
index 34f134f3f49..fddf1db71a3 100644
--- a/tests/models/perceiver/test_modeling_perceiver.py
+++ b/tests/models/perceiver/test_modeling_perceiver.py
@@ -456,7 +456,8 @@ class PerceiverModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/rt_detr/test_modeling_rt_detr.py b/tests/models/rt_detr/test_modeling_rt_detr.py
index 5dedeaceaec..fa2938160d7 100644
--- a/tests/models/rt_detr/test_modeling_rt_detr.py
+++ b/tests/models/rt_detr/test_modeling_rt_detr.py
@@ -335,7 +335,8 @@ class RTDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
index e9874f7c515..a78f11ea46c 100644
--- a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
+++ b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py
@@ -339,7 +339,8 @@ class RTDetrV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py
index aad32b1be8e..4e92baf2d3d 100644
--- a/tests/models/rwkv/test_modeling_rwkv.py
+++ b/tests/models/rwkv/test_modeling_rwkv.py
@@ -312,7 +312,8 @@ class RwkvModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()

diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py
index cce221a2192..978323413ca 100644
--- a/tests/models/sam/test_modeling_sam.py
+++ b/tests/models/sam/test_modeling_sam.py
@@ -202,7 +202,8 @@ class SamVisionModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
@@ -590,7 +591,8 @@ class SamModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/sam/test_modeling_tf_sam.py b/tests/models/sam/test_modeling_tf_sam.py
index 01c3ca0588a..6b4cd75467f 100644
--- a/tests/models/sam/test_modeling_tf_sam.py
+++ b/tests/models/sam/test_modeling_tf_sam.py
@@ -562,7 +562,8 @@ class TFSamModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         outputs = model(**self._prepare_for_class(inputs_dict, model_class))

         vision_attentions = outputs.vision_attentions
diff --git a/tests/models/sam_hq/test_modeling_sam_hq.py b/tests/models/sam_hq/test_modeling_sam_hq.py
index 502da82a277..915ce022fc0 100644
--- a/tests/models/sam_hq/test_modeling_sam_hq.py
+++ b/tests/models/sam_hq/test_modeling_sam_hq.py
@@ -210,7 +210,8 @@ class SamHQVisionModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
@@ -637,7 +638,8 @@ class SamHQModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
index 938bdcf619a..e802e8cfb92 100644
--- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
@@ -475,7 +475,8 @@ class SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 6dc61db6f6e..75ff7edccbd 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -491,7 +491,8 @@ class SeamlessM4Tv2ModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase)
         inputs_dict["output_attentions"] = True
         inputs_dict["output_hidden_states"] = False
         config.return_dict = True
-        model = model_class(config)
+        model = model_class._from_config(config, attn_implementation="eager")
+        config = model.config
         model.to(torch_device)
         model.eval()
         with torch.no_grad():
inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 922ca66e089..593aca14066 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -434,7 +434,8 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() diff --git a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py index 6073afc2bde..613081a82e0 100644 --- a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py @@ -323,7 +323,8 @@ class TFSpeech2TextModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.T inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(encoder_seq_length) subsampled_encoder_key_length = model._get_feat_extract_output_lengths(encoder_key_length) diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index 41d4e82c535..5fc1d670666 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -415,7 +415,8 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase, Generatio inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() @@ -1524,7 +1525,8 @@ class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase): inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index a211fa97db1..a8e35d429fb 100644 --- a/tests/models/swin/test_modeling_swin.py +++ b/tests/models/swin/test_modeling_swin.py @@ -303,7 +303,8 @@ class SwinModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py index 
30a0c3de9a2..9f8376984fd 100644 --- a/tests/models/swin2sr/test_modeling_swin2sr.py +++ b/tests/models/swin2sr/test_modeling_swin2sr.py @@ -263,7 +263,8 @@ class Swin2SRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py index 1324df29473..69f0a77f3a2 100644 --- a/tests/models/swinv2/test_modeling_swinv2.py +++ b/tests/models/swinv2/test_modeling_swinv2.py @@ -286,7 +286,8 @@ class Swinv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 6d35c73e3d1..bac298a12ed 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -293,7 +293,8 @@ class TableTransformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest. inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 42a663e744e..02c7a1111c0 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -286,7 +286,8 @@ class TimeSeriesTransformerModelTest(ModelTesterMixin, PipelineTesterMixin, unit inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index 4489ee7f3c7..4537003b099 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -373,7 +373,8 @@ class ViltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index b214a136350..8b715137a37 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ 
b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -412,7 +412,8 @@ class VisualBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py index 2b71b081fe5..d4d3efe3748 100644 --- a/tests/models/vivit/test_modeling_vivit.py +++ b/tests/models/vivit/test_modeling_vivit.py @@ -243,7 +243,8 @@ class VivitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 37e459db983..161bb33a801 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -676,7 +676,8 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 161e30ee2e4..1a0c7dda6e0 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -236,7 +236,8 @@ class XCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py index d7b13f5846d..3c2a78a8720 100644 --- a/tests/models/yolos/test_modeling_yolos.py +++ b/tests/models/yolos/test_modeling_yolos.py @@ -233,7 +233,8 @@ class YolosModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index f8574de00f7..2a142bfc73e 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -401,7 +401,8 @@ class ZambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() 
diff --git a/tests/models/zamba2/test_modeling_zamba2.py b/tests/models/zamba2/test_modeling_zamba2.py index a11ac6106cd..894cde8be33 100644 --- a/tests/models/zamba2/test_modeling_zamba2.py +++ b/tests/models/zamba2/test_modeling_zamba2.py @@ -422,7 +422,8 @@ class Zamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py index 0886aa1426a..8a358f45de1 100644 --- a/tests/pipelines/test_pipelines_text_to_audio.py +++ b/tests/pipelines/test_pipelines_text_to_audio.py @@ -259,6 +259,7 @@ class TextToAudioPipelineTests(unittest.TestCase): model_test_kwargs = {} if model.can_generate(): # not all models in this pipeline can generate and, therefore, take `generate` kwargs model_test_kwargs["max_new_tokens"] = 5 + model.config._attn_implementation = "eager" speech_generator = TextToAudioPipeline( model=model, tokenizer=tokenizer, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index ddef77eef13..4ddbbcb47f2 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -974,7 +974,8 @@ class ModelTesterMixin: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") + config = model.config model.to(torch_device) model.eval() with torch.no_grad(): @@ -1720,7 +1721,7 @@ class ModelTesterMixin: # no need to test all models as different heads yield the same functionality model_class = self.all_model_classes[0] - model = model_class(config) + model = model_class._from_config(config, attn_implementation="eager") model.to(torch_device) inputs = self._prepare_for_class(inputs_dict, model_class) diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py index f4ffc3cb117..a34e9a5ea9a 100644 --- a/tests/utils/test_configuration_utils.py +++ b/tests/utils/test_configuration_utils.py @@ -189,6 +189,7 @@ class ConfigTestUtils(unittest.TestCase): self.assertListEqual( missing_keys, [ + "_output_attentions", "is_encoder_decoder", "_name_or_path", "_commit_hash",