Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)

Fix some fa2 tests (#35340)

* remove fa2 test
* remove other failing tests
* style

This commit is contained in:
parent 667ed5635e
commit 1fa807fa63
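The tests removed by this commit exercised the legacy `use_flash_attention_2=True` loading path. For context, a minimal sketch (not part of the commit) of the non-legacy equivalent using the `attn_implementation` argument, with a placeholder checkpoint name, looks like this:

```python
# Sketch only: load a causal LM with FlashAttention-2 via `attn_implementation`
# instead of the legacy `use_flash_attention_2=True` flag the removed tests covered.
# "your-org/your-model" is a placeholder checkpoint, not one used in these tests.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "your-org/your-model",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
).to("cuda")

# The selected attention backend is recorded on the config, as the removed tests asserted.
assert model.config._attn_implementation == "flash_attention_2"
```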
@@ -14,14 +14,12 @@
# limitations under the License.
"""Testing suite for the PyTorch Granite model."""

import tempfile
import unittest

from parameterized import parameterized

from transformers import GraniteConfig, is_torch_available, set_seed
from transformers.testing_utils import (
    require_flash_attn,
    require_read_token,
    require_torch,
    require_torch_gpu,
@@ -417,33 +415,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
        with self.assertRaises(AssertionError):
            torch.testing.assert_close(yarn_sin_long, original_sin_long)

    @require_flash_attn
    @require_torch_gpu
    @slow
    def test_use_flash_attention_2_true(self):
        """
        NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
        """
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            with tempfile.TemporaryDirectory() as tmp_dir:
                model = model_class(config)
                model.save_pretrained(tmp_dir)

                new_model = GraniteForCausalLM.from_pretrained(
                    tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
                ).to("cuda")

                self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")

                has_flash = False
                for name, submodule in new_model.named_modules():
                    if "FlashAttention" in submodule.__class__.__name__:
                        has_flash = True
                        break
                if not has_flash:
                    raise ValueError("The flash model should have flash attention layers")


@require_torch_gpu
class GraniteIntegrationTest(unittest.TestCase):
@@ -14,14 +14,12 @@
# limitations under the License.
"""Testing suite for the PyTorch GraniteMoe model."""

import tempfile
import unittest

from parameterized import parameterized

from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed
from transformers.testing_utils import (
    require_flash_attn,
    require_read_token,
    require_torch,
    require_torch_gpu,
@@ -416,33 +414,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
        with self.assertRaises(AssertionError):
            torch.testing.assert_close(yarn_sin_long, original_sin_long)

    @require_flash_attn
    @require_torch_gpu
    @slow
    def test_use_flash_attention_2_true(self):
        """
        NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
        """
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            with tempfile.TemporaryDirectory() as tmp_dir:
                model = model_class(config)
                model.save_pretrained(tmp_dir)

                new_model = GraniteMoeForCausalLM.from_pretrained(
                    tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
                ).to("cuda")

                self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")

                has_flash = False
                for name, submodule in new_model.named_modules():
                    if "FlashAttention" in submodule.__class__.__name__:
                        has_flash = True
                        break
                if not has_flash:
                    raise ValueError("The flash model should have flash attention layers")


@require_torch_gpu
class GraniteMoeIntegrationTest(unittest.TestCase):
@@ -14,10 +14,8 @@
# limitations under the License.
"""Testing suite for the PyTorch LLaMA model."""

import tempfile
import unittest

import pytest
from packaging import version
from parameterized import parameterized

@@ -25,7 +23,6 @@ from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_avail
from transformers.generation.configuration_utils import GenerationConfig
from transformers.testing_utils import (
    cleanup,
    require_flash_attn,
    require_read_token,
    require_torch,
    require_torch_accelerator,
@@ -543,34 +540,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
        with self.assertRaises(KeyError):
            config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}})  # missing "factor"

    @require_flash_attn
    @require_torch_gpu
    @slow
    @pytest.mark.flash_attn_test
    def test_use_flash_attention_2_true(self):
        """
        NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
        """
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            with tempfile.TemporaryDirectory() as tmp_dir:
                model = model_class(config)
                model.save_pretrained(tmp_dir)

                new_model = LlamaForCausalLM.from_pretrained(
                    tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
                ).to("cuda")

                self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")

                has_flash = False
                for name, submodule in new_model.named_modules():
                    if "FlashAttention" in submodule.__class__.__name__:
                        has_flash = True
                        break
                if not has_flash:
                    raise ValueError("The flash model should have flash attention layers")


@require_torch_gpu
class LlamaIntegrationTest(unittest.TestCase):
@@ -2769,8 +2769,6 @@ class ModelTesterMixin:
                attributes = tuple([f"{name}_{idx}" for idx in range(len(fx_outputs))])

            for fx_output, pt_output, attr in zip(fx_outputs, pt_outputs, attributes):
                if isinstance(pt_output, DynamicCache):
                    pt_output = pt_output.to_legacy_cache()
                self.check_pt_flax_outputs(fx_output, pt_output, model_class, tol=tol, name=attr)

        elif isinstance(fx_outputs, jnp.ndarray):
@@ -3612,34 +3610,6 @@ class ModelTesterMixin:
                num_params < 1000000
            ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max."

    @require_flash_attn
    @require_torch_gpu
    @mark.flash_attn_test
    @slow
    def test_flash_attn_2_conversion(self):
        if not self.has_attentions:
            self.skipTest(reason="Model architecture does not support attentions")

        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            if not model_class._supports_flash_attn_2:
                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")

            model = model_class(config)

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname)
                model = model_class.from_pretrained(
                    tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
                ).to(torch_device)

                for _, module in model.named_modules():
                    if "FlashAttention" in module.__class__.__name__:
                        return

                self.assertTrue(False, "FlashAttention2 modules not found in model")

    @require_flash_attn
    @require_torch_gpu
    @mark.flash_attn_test
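The removed tests all verify FlashAttention-2 usage the same way: scan the model's submodules for a class whose name contains "FlashAttention". A minimal standalone sketch of that check (not part of the commit, assuming only that the model is a `torch.nn.Module`):

```python
# Sketch only: the runtime check performed by the removed tests, as a reusable helper.
from torch import nn


def uses_flash_attention_2(model: nn.Module) -> bool:
    """Return True if any submodule is a FlashAttention-based attention implementation."""
    return any("FlashAttention" in module.__class__.__name__ for module in model.modules())
```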