enable blip2 and emu3 cases on XPU (#37662)

* enable blip2 and emu3 modeling cases on XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* remove extra new line

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-07-03 12:50:06 +06:00 · 2025-04-23 00:37:09 +08:00 · 2025-04-23 00:37:09 +08:00 · ece79b0688
commit ece79b0688
parent ca4c114dc4
2 changed files with 28 additions and 12 deletions
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@ -27,7 +27,6 @@ from transformers.testing_utils import (
    require_torch,
    require_torch_accelerator,
    require_torch_fp16,
-    require_torch_gpu,
    require_torch_multi_accelerator,
    require_torch_sdpa,
    require_vision,
@ -1400,7 +1399,7 @@ class Blip2VisionModelWithProjectionTest(ModelTesterMixin, unittest.TestCase):
            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)

    @slow
-    @require_torch_gpu
+    @require_torch_accelerator
    def test_model_from_pretrained(self):
        model_name = "Salesforce/blip2-itm-vit-g"
        model = Blip2VisionModelWithProjection.from_pretrained(model_name)
@ -1551,7 +1550,7 @@ class Blip2TextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
            self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict())

    @slow
-    @require_torch_gpu
+    @require_torch_accelerator
    def test_model_from_pretrained(self):
        model_name = "Salesforce/blip2-itm-vit-g"
        model = Blip2ForImageTextRetrieval.from_pretrained(model_name)
--- a/tests/models/emu3/test_modeling_emu3.py
+++ b/tests/models/emu3/test_modeling_emu3.py
@ -23,9 +23,10 @@ from parameterized import parameterized

 from transformers import Emu3Config, Emu3TextConfig, is_torch_available, is_vision_available, set_seed
 from transformers.testing_utils import (
+    Expectations,
    require_bitsandbytes,
    require_torch,
-    require_torch_large_gpu,
+    require_torch_large_accelerator,
    slow,
    torch_device,
 )
@ -416,7 +417,7 @@ class Emu3IntegrationTest(unittest.TestCase):

    @slow
    @require_bitsandbytes
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
    def test_model_generation_batched(self):
        model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
        processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
@ -434,17 +435,27 @@ class Emu3IntegrationTest(unittest.TestCase):
        )

        # greedy generation outputs
-        EXPECTED_TEXT_COMPLETION = [
-            "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.",
-            'USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a steep, reddish-brown cliff, which could be a'
-        ]  # fmt: skip
+        EXPECTED_TEXT_COMPLETIONS = Expectations(
+            {
+                ("xpu", 3): [
+                    "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and its head is lowered, suggesting a state of alertness or readiness. The animal's",
+                    "USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a gently sloping hill with a reddish-brown hue,",
+                ],
+                ("cuda", 7): [
+                    "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.",
+                    "USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a steep, reddish-brown cliff, which could be a",
+                ],
+            }
+        )  # fmt: skip
+        EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
+
        generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)
        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

    @slow
    @require_bitsandbytes
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
    def test_model_generation_multi_image(self):
        model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
        processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
@ -456,14 +467,20 @@ class Emu3IntegrationTest(unittest.TestCase):
        inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)

        # greedy generation outputs
-        EXPECTED_TEXT_COMPLETION = ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"]  # fmt: skip
+        EXPECTED_TEXT_COMPLETIONS = Expectations(
+                {
+                    ("xpu", 3): ['USER: 64*6464*64What do these two images have in common? ASSISTANT: The two images both depict a rhinoceros, yet they are significantly different in terms of focus and clarity. The rhinoceros in the upper image is in sharp focus, showing detailed textures'],
+                    ("cuda", 7): ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"],
+                }
+            )  # fmt: skip
+        EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
        generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)
        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

    @slow
    @require_bitsandbytes
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
    def test_model_generate_images(self):
        model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Gen-hf", load_in_4bit=True)
        processor = Emu3Processor.from_pretrained("BAAI/Emu3-Gen-hf")