From e55983e2b90e10f5f82d5ebbd5a6735f002dbebb Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Mon, 9 Jun 2025 22:18:52 +0200
Subject: [PATCH] Fix `aya_vision` test (#38674)

* fix 1: load_in_4bit=True,

* fix 2: decorator

* fixfix 2: breakpoint

* fixfix 3: update

* fixfix 4: fast

* fixfix 5: cond

* fixfix 5: cond

* fixfix 6: cuda 8

* ruff

* breakpoint

* dtype

* a10

* a10

---------

Co-authored-by: ydshieh
---
 .../aya_vision/test_modeling_aya_vision.py    | 97 ++++++++++++++-----
 .../aya_vision/test_processor_aya_vision.py   |  3 +-
 2 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index d0afad7d17d..e4596591e25 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -27,6 +27,7 @@ from transformers import (
 from transformers.testing_utils import (
     Expectations,
     cleanup,
+    get_device_properties,
     require_deterministic_for_xpu,
     require_read_token,
     require_torch,
@@ -330,19 +331,39 @@ class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
 @require_read_token
 @require_torch
 class AyaVisionIntegrationTest(unittest.TestCase):
-    def setUp(self):
-        self.model_checkpoint = "CohereForAI/aya-vision-8b"
+    @classmethod
+    def setUpClass(cls):
+        cls.model_checkpoint = "CohereForAI/aya-vision-8b"
+        cls.model = None
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.model
+        cleanup(torch_device, gc_collect=True)
 
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
+    @classmethod
+    def get_model(cls):
+        # Use 4-bit on T4
+        load_in_4bit = get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8
+        torch_dtype = None if load_in_4bit else torch.float16
+
+        if cls.model is None:
+            cls.model = AyaVisionForConditionalGeneration.from_pretrained(
+                cls.model_checkpoint,
+                device_map=torch_device,
+                torch_dtype=torch_dtype,
+                load_in_4bit=load_in_4bit,
+            )
+        return cls.model
+
     @slow
     @require_torch_accelerator
     def test_small_model_integration_forward(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         messages = [
             {
                 "role": "user",
@@ -361,7 +382,17 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         output = model(**inputs)
 
         actual_logits = output.logits[0, -1, :5].cpu()
-        expected_logits = torch.tensor([0.4109, 0.1532, 0.8018, 2.1328, 0.5483], dtype=torch.float16)
+
+        EXPECTED_LOGITS = Expectations(
+            {
+                ("xpu", 3): [0.4109, 0.1532, 0.8018, 2.1328, 0.5483],
+                # 4-bit
+                ("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488],
+                ("cuda", 8): [1.6396, 0.6094, 3.1992, 8.5234, 2.1875],
+            }
+        )  # fmt: skip
+        expected_logits = torch.tensor(EXPECTED_LOGITS.get_expectation(), dtype=torch.float16)
+
         self.assertTrue(
             torch.allclose(actual_logits, expected_logits, atol=0.1),
             f"Actual logits: {actual_logits}"
@@ -374,9 +405,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
     @require_deterministic_for_xpu
     def test_small_model_integration_generate_text_only(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         messages = [
             {
                 "role": "user",
@@ -398,7 +427,9 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.",
-                ("cuda", 7): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
+                # 4-bit
+                ("cuda", 7): "Sure, here's a haiku for you:\n\nMorning dew sparkles,\nPetals unfold in sunlight,\n",
+                ("cuda", 8): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.",
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -409,9 +440,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
     @require_torch_accelerator
     def test_small_model_integration_generate_chat_template(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         messages = [
             {
                 "role": "user",
@@ -430,16 +459,24 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         decoded_output = processor.decode(
             generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
         )
-        expected_output = "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,"  # fmt: skip
+
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): "The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,",
+                # 4-bit
+                ("cuda", 7): 'The image depicts two cats comfortably resting on a pink blanket spread across a sofa. The cats,',
+                ("cuda", 8): 'The image depicts a cozy scene of two cats resting on a bright pink blanket. The cats,',
+            }
+        )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
+
         self.assertEqual(decoded_output, expected_output)
 
     @slow
     @require_torch_accelerator
     def test_small_model_integration_batched_generate(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         # Prepare inputs
         messages = [
             [
@@ -472,7 +509,9 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
-                ("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
+                # 4-bit
+                ("cuda", 7): "Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene",
+                ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -485,7 +524,16 @@ class AyaVisionIntegrationTest(unittest.TestCase):
 
         # Check second output
         decoded_output = processor.decode(output[1, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-        expected_output = 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a'  # fmt: skip
+
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
+                # 4-bit
+                ("cuda", 7): 'This vibrant image captures a bustling street scene in a multicultural urban area, featuring a traditional Chinese gate adorned with intricate red and',
+                ("cuda", 8): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
+            }
+        )  # fmt: skip
+        expected_output = expected_outputs.get_expectation()
 
         self.assertEqual(
             decoded_output,
@@ -498,9 +546,7 @@ class AyaVisionIntegrationTest(unittest.TestCase):
     @require_deterministic_for_xpu
     def test_small_model_integration_batched_generate_multi_image(self):
         processor = AutoProcessor.from_pretrained(self.model_checkpoint)
-        model = AyaVisionForConditionalGeneration.from_pretrained(
-            self.model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
-        )
+        model = self.get_model()
         # Prepare inputs
         messages = [
             [
@@ -543,7 +589,8 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
-                ("cuda", 7): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.",
+                ("cuda", 7): 'Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene',
+                ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
@@ -559,10 +606,12 @@ class AyaVisionIntegrationTest(unittest.TestCase):
         expected_outputs = Expectations(
             {
                 ("xpu", 3): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ",
-                ("cuda", 7): "The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at a",
+                ("cuda", 7): 'The first image showcases the Statue of Liberty, a monumental sculpture located on Liberty Island in New York Harbor. Standing atop a',
+                ("cuda", 8): 'The first image showcases the Statue of Liberty, a colossal neoclassical sculpture on Liberty Island in New York Harbor. Standing at ',
             }
         )  # fmt: skip
         expected_output = expected_outputs.get_expectation()
+
         self.assertEqual(
             decoded_output,
             expected_output,
diff --git a/tests/models/aya_vision/test_processor_aya_vision.py b/tests/models/aya_vision/test_processor_aya_vision.py
index e0983d489e2..4e17bea44fa 100644
--- a/tests/models/aya_vision/test_processor_aya_vision.py
+++ b/tests/models/aya_vision/test_processor_aya_vision.py
@@ -17,7 +17,7 @@ import tempfile
 import unittest
 
 from transformers import AutoProcessor, AutoTokenizer, AyaVisionProcessor
-from transformers.testing_utils import require_read_token, require_torch, require_vision
+from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
 from ...test_processing_common import ProcessorTesterMixin
@@ -31,7 +31,6 @@ if is_vision_available():
     from transformers import GotOcr2ImageProcessor
 
 
-@require_read_token
 @require_vision
 class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     processor_class = AyaVisionProcessor
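
Note on the pattern introduced above: `get_model` gates quantization on the CI device, loading the checkpoint in 4-bit on pre-Ampere CUDA GPUs (compute capability < 8, e.g. T4 runners) and in fp16 elsewhere, and caches the result at class scope so the 8B checkpoint is loaded only once per test class. A minimal standalone sketch of the same idea follows; it assumes, as the diff does, that `get_device_properties()` returns a `(device_type, major_version)` tuple, and the helper name `load_cached_model` is hypothetical:

```python
import torch
from transformers import AutoModelForCausalLM
from transformers.testing_utils import get_device_properties, torch_device

_model_cache = {}  # checkpoint name -> loaded model, so each checkpoint loads once


def load_cached_model(checkpoint: str):
    """Hypothetical helper mirroring `get_model` in the diff above."""
    if checkpoint not in _model_cache:
        device_type, major = get_device_properties()[:2]
        # Pre-Ampere CUDA (compute capability < 8, e.g. a T4) -> 4-bit to fit memory;
        # leave torch_dtype unset so the quantization backend picks the compute dtype.
        load_in_4bit = device_type == "cuda" and major < 8
        torch_dtype = None if load_in_4bit else torch.float16
        _model_cache[checkpoint] = AutoModelForCausalLM.from_pretrained(
            checkpoint,
            device_map=torch_device,
            torch_dtype=torch_dtype,
            load_in_4bit=load_in_4bit,
        )
    return _model_cache[checkpoint]
```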
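Relatedly, the diff replaces single expected-output literals with `Expectations` tables keyed by `(device_type, major_version)`: 4-bit inference on capability-7 GPUs legitimately produces different logits and generations than fp16 on capability-8 hardware or XPU, so each device family gets its own reference values via `get_expectation()`.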