From e55983e2b90e10f5f82d5ebbd5a6735f002dbebb Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Mon, 9 Jun 2025 22:18:52 +0200
Subject: [PATCH] Fix `aya_vision` test (#38674)

* fix 1: load_in_4bit=True,

* fix 2: decorator

* fixfix 2: breakpoint

* fixfix 3: update

* fixfix 4: fast

* fixfix 5: cond

* fixfix 5: cond

* fixfix 6: cuda 8

* ruff

* breakpoint

* dtype

* a10

* a10

---------

Co-authored-by: ydshieh
---
 .../aya_vision/test_modeling_aya_vision.py    | 97 ++++++++++++++-----
 .../aya_vision/test_processor_aya_vision.py   |  3 +-
 2 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index d0afad7d17d..e4596591e25 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -27,6 +27,7 @@ from transformers import (
 from transformers.testing_utils import (
     Expectations,
     cleanup,
+    get_device_properties,
     require_deterministic_for_xpu,
     require_read_token,
     require_torch,
@@ -330,19 +331,39 @@ class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
 @require_read_token
 @require_torch
 class AyaVisionIntegrationTest(unittest.TestCase):
-    def setUp(self):
-        self.model_checkpoint = "CohereForAI/aya-vision-8b"
+    @classmethod
+    def setUpClass(cls):
+        cls.model_checkpoint = "CohereForAI/aya-vision-8b"
+        cls.model = None
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.model
+        cleanup(torch_device, gc_collect=True)
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
+    @classmethod
+    def get_model(cls):
+        # Use 4-bit on T4
+        load_in_4bit = get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8
+        torch_dtype = None if load_in_4bit else torch.float16
+
+        if cls.model is None:
+            cls.model = AyaVisionForConditionalGeneration.from_pretrained(
+                cls.model_checkpoint,
+                device_map=torch_device,
+                torch_dtype=torch_dtype,
+                load_in_4bit=load_in_4bit,
+            )
+        return cls.model
+
     @slow
     @require_torch_accelerator
     def test_small_model_integration_forward(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         messages = [
             {
                 "role": "user",
@@ -361,7 +382,17 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         output = model(**inputs)
 
         actual_logits = output.logits[0, -1, :5].cpu()
-        expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16)
+
+        EXPECTED_LOGITS = Expectations(
+            {
+                ("xpu", 3): [0.4109, 0.1532, 0.8018, 2.1328, 0.5483],
+                # 4-bit
+                ("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488],
+                ("cuda", 8): [1.6396, 0.6094, 3.1992, 8.5234, 2.1875],
+            }
+        )  # fmt: skip
+        expected_logits = torch.tensor(EXPECTED_LOGITS.get_expectation(), dtype=torch.float16)
+
         self.assertTrue(
             torch.allclose(actual_logits, expected_logits, atol=0.1),
             f"Actual logits: {actual_logits}"
@@ -374,9 +405,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
     @require_deterministic_for_xpu
     def test_small_model_integration_generate_text_only(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         messages = [
             {
                 "role": "user",
@@ -398,7 +427,9 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.",
-                ("cuda", 7): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
+                # 4-bit
+                ("cuda", 7): "Sure, here's a haiku for you:\n\nMorning dew sparkles,\nPetals unfold in sunlight,\n",
+                ("cuda", 8): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -409,9 +440,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
     @require_torch_accelerator
     def test_small_model_integration_generate_chat_template(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         messages = [
             {
                 "role": "user",
@@ -430,16 +459,24 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         decoded_output = processor.decode(
             generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
         )
-        expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,"  # fmt: skip
+
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,",
+                # 4-bit
+                ("cuda", 7): 'The image depicts two cats comfortably resting on a pink blanket spread across a sofa. The cats,',
+                ("cuda", 8): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,',
+            }
+        )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
+
         self.assertEqual(decoded_output, expected_output)
 
     @slow
     @require_torch_accelerator
     def test_small_model_integration_batched_generate(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         # Prepare inputs
         messages = [
             [
@@ -472,7 +509,9 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
-                ("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
+                # 4-bit
+                ("cuda", 7): "Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene",
+                ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -485,7 +524,16 @@ class AyaVisionIntegrationTest(unittest.TestCase):
 
         # Check second output
         decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-        expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a'  # fmt: skip
+
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
+                # 4-bit
+                ("cuda", 7): 'This vibrant image captures a bustling street scene in a multicultural urban area, featuring a traditional Chinese gate adorned with intricate red and',
+                ("cuda", 8): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
+            }
+        )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
 
         self.assertEqual(
             decoded_output,
@@ -498,9 +546,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
     @require_deterministic_for_xpu
     def test_small_model_integration_batched_generate_multi_image(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         # Prepare inputs
         messages = [
             [
@@ -543,7 +589,8 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
-                ("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
+                ("cuda", 7): 'Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene',
+                ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -559,10 +606,12 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
-                ("cuda", 7): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at a",
+                ("cuda", 7): 'The first image showcases the Statue of Liberty, a monumental sculpture located on Liberty Island in New York Harbor. Standing atop a',
+                ("cuda", 8): 'The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
+
         self.assertEqual(
             decoded_output,
             expected_output,
diff --git a/tests/models/aya_vision/test_processor_aya_vision.py b/tests/models/aya_vision/test_processor_aya_vision.py
index e0983d489e2..4e17bea44fa 100644
--- a/tests/models/aya_vision/test_processor_aya_vision.py
+++ b/tests/models/aya_vision/test_processor_aya_vision.py
@@ -17,7 +17,7 @@ import tempfile
 import unittest
 
 from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor
-from transformers.testing_utils import require_read_token, require_torch, require_vision
+from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_processing_common import ProcessorTesterMixin
@@ -31,7 +31,6 @@ if is_vision_available():
     from transformers import GotOcr2ImageProcessor
 
 
-@require_read_token
 @require_vision
 class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     processor_class = AyaVisionProcessor
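
Note on the pattern introduced above: `get_model` gates quantization on the CI device, loading the checkpoint in 4-bit on pre-Ampere CUDA GPUs (compute capability < 8, e.g. T4 runners) and in fp16 elsewhere, and caches the result at class scope so the 8B checkpoint is loaded only once per test class. A minimal standalone sketch of the same idea follows; it assumes, as the diff does, that `get_device_properties()` returns a `(device_type, major_version)` tuple, and the helper name `load_cached_model` is hypothetical:

```python
import torch
from transformers import AutoModelForCausalLM
from transformers.testing_utils import get_device_properties, torch_device

_model_cache = {}  # checkpoint name -> loaded model, so each checkpoint loads once


def load_cached_model(checkpoint: str):
    """Hypothetical helper mirroring `get_model` in the diff above."""
    if checkpoint not in _model_cache:
        device_type, major = get_device_properties()[:2]
        # Pre-Ampere CUDA (compute capability < 8, e.g. a T4) -> 4-bit to fit memory;
        # leave torch_dtype unset so the quantization backend picks the compute dtype.
        load_in_4bit = device_type == "cuda" and major < 8
        torch_dtype = None if load_in_4bit else torch.float16
        _model_cache[checkpoint] = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            device_map=torch_device,
            torch_dtype=torch_dtype,
            load_in_4bit=load_in_4bit,
        )
    return _model_cache[checkpoint]
```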
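Relatedly, the diff replaces single expected-output literals with `Expectations` tables keyed by `(device_type, major_version)`: 4-bit inference on capability-7 GPUs legitimately produces different logits and generations than fp16 on capability-8 hardware or XPU, so each device family gets its own reference values via `get_expectation()`.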