diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index 785ad723f44..d0afad7d17d 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -297,10 +297,6 @@ class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
-    @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
-    def test_sdpa_equivalence(self):
-        pass
-
     @unittest.skip(reason="SiglipVisionModel does not support standalone training")
     def test_training(self):
         pass
diff --git a/tests/models/cohere2/test_modeling_cohere2.py b/tests/models/cohere2/test_modeling_cohere2.py
index 63c067df57f..195be1c23d8 100644
--- a/tests/models/cohere2/test_modeling_cohere2.py
+++ b/tests/models/cohere2/test_modeling_cohere2.py
@@ -127,10 +127,6 @@ class Cohere2ModelTest(CohereModelTest, unittest.TestCase):
     def test_generate_continue_from_inputs_embeds(self):
         pass
 
-    @unittest.skip("Cohere2's eager attn/sdpa attn outputs are expected to be different")
-    def test_sdpa_equivalence(self):
-        pass
-
 
 @slow
 @require_read_token
diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
index e56dcc7d861..9b796937b08 100644
--- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
+++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
@@ -300,10 +300,6 @@ class DeepseekV3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
     def test_generate_continue_from_inputs_embeds(self):
         pass
 
-    @unittest.skip("DeepseekV3's eager attn/sdpa attn outputs are expected to be different")
-    def test_sdpa_equivalence(self):
-        pass
-
     @unittest.skip("Deepseek-V3 uses MLA so it is not compatible with the standard cache format")
     def test_beam_search_generate_dict_outputs_use_cache(self):
         pass
diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py
index ce0aadd1637..0a4cff3c7ff 100644
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -303,38 +303,6 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
     def test_flash_attn_2_inference_equivalence_right_padding(self):
         self.skipTest(reason="Gemma flash attention does not support right padding")
 
-    @require_torch_sdpa
-    @require_torch_accelerator
-    @slow
-    def test_sdpa_equivalence(self):
-        for model_class in self.all_model_classes:
-            if not model_class._supports_sdpa:
-                self.skipTest(reason="Model does not support SDPA")
-
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model_sdpa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa"
-                )
-                model_sdpa.to(torch_device)
-
-                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="eager")
-                model.to(torch_device)
-
-                dummy_input = inputs_dict[model_class.main_input_name]
-                dummy_input = dummy_input.to(torch_device)
-                outputs = model(dummy_input, output_hidden_states=True)
-                outputs_sdpa = model_sdpa(dummy_input, output_hidden_states=True)
-
-                logits = outputs.hidden_states[-1]
-                logits_sdpa = outputs_sdpa.hidden_states[-1]
-
-                # gemma sdpa needs a high tolerance
-                assert torch.allclose(logits_sdpa, logits, atol=3e-3)
-
     @require_flash_attn
     @require_torch_gpu
     @pytest.mark.flash_attn_test
diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py
index d08a5ee6a7f..6ee4e8f2327 100644
--- a/tests/models/gemma2/test_modeling_gemma2.py
+++ b/tests/models/gemma2/test_modeling_gemma2.py
@@ -143,10 +143,6 @@ class Gemma2ModelTest(GemmaModelTest, unittest.TestCase):
     def test_generate_continue_from_inputs_embeds(self):
         pass
 
-    @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different")
-    def test_sdpa_equivalence(self):
-        pass
-
     @unittest.skip(
         reason="HybridCache can't be gathered because it is not iterable. Adding a simple iter and dumping `distributed_iterator`"
         " as in Dynamic Cache doesn't work. NOTE: @gante all cache objects would need better compatibility with multi gpu setting"
diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py
index f7dcf273252..6dd2fb5cd65 100644
--- a/tests/models/nemotron/test_modeling_nemotron.py
+++ b/tests/models/nemotron/test_modeling_nemotron.py
@@ -28,7 +28,6 @@ from transformers.testing_utils import (
     require_torch,
     require_torch_accelerator,
     require_torch_gpu,
-    require_torch_sdpa,
     slow,
     torch_device,
 )
@@ -102,38 +101,6 @@ class NemotronModelTest(GemmaModelTest):
     def test_model_outputs_equivalence(self, **kwargs):
         pass
 
-    @require_torch_sdpa
-    @require_torch_accelerator
-    @slow
-    def test_sdpa_equivalence(self):
-        for model_class in self.all_model_classes:
-            if not model_class._supports_sdpa:
-                self.skipTest(reason="Model does not support SDPA")
-
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model_sdpa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa"
-                )
-                model_sdpa.to(torch_device)
-
-                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="eager")
-                model.to(torch_device)
-
-                dummy_input = inputs_dict[model_class.main_input_name]
-                dummy_input = dummy_input.to(torch_device)
-                outputs = model(dummy_input, output_hidden_states=True)
-                outputs_sdpa = model_sdpa(dummy_input, output_hidden_states=True)
-
-                logits = outputs.hidden_states[-1]
-                logits_sdpa = outputs_sdpa.hidden_states[-1]
-
-                # nemotron sdpa needs a high tolerance
-                assert torch.allclose(logits_sdpa, logits, atol=1e-2)
-
     @require_flash_attn
     @require_torch_gpu
     @pytest.mark.flash_attn_test
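
For reference, every `test_sdpa_equivalence` override removed above boils down to the same check: run one forward pass with `attn_implementation="eager"` and one with `attn_implementation="sdpa"`, then compare the last hidden states within a tolerance. Below is a minimal self-contained sketch of that comparison on bare tensors rather than a full model; the shapes, seed, and float32 dtype are illustrative assumptions, not values from this PR.

# Sketch of the eager-vs-SDPA equivalence check, reduced to bare tensors.
import math

import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch, heads, seq_len, head_dim = 2, 4, 16, 32  # illustrative shapes, not from the PR
q, k, v = (torch.randn(batch, heads, seq_len, head_dim) for _ in range(3))

# "Eager" attention: explicit softmax(Q @ K^T / sqrt(d)) @ V.
scores = q @ k.transpose(-2, -1) / math.sqrt(head_dim)
eager_out = torch.softmax(scores, dim=-1) @ v

# SDPA path: same math, dispatched to a fused or fallback kernel.
sdpa_out = F.scaled_dot_product_attention(q, k, v)

# In float32 the two paths agree tightly. The deleted tests ran real models
# in float16 on an accelerator, where kernel-level accumulation differences
# forced the loose atol=3e-3 (gemma) / atol=1e-2 (nemotron) tolerances.
torch.testing.assert_close(eager_out, sdpa_out, atol=1e-5, rtol=1e-5)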