enable blip2 and emu3 cases on XPU (#37662)

* enable blip2 and emu3 modeling cases on XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* remove extra new line

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yao Matrix 2025-04-23 00:37:09 +08:00 committed by GitHub
parent ca4c114dc4
commit ece79b0688
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 28 additions and 12 deletions

View File

@@ -27,7 +27,6 @@ from transformers.testing_utils import (
require_torch,
require_torch_accelerator,
require_torch_fp16,
require_torch_gpu,
require_torch_multi_accelerator,
require_torch_sdpa,
require_vision,
@@ -1400,7 +1399,7 @@ class Blip2VisionModelWithProjectionTest(ModelTesterMixin, unittest.TestCase):
self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
@slow
@require_torch_gpu
@require_torch_accelerator
def test_model_from_pretrained(self):
model_name = "Salesforce/blip2-itm-vit-g"
model = Blip2VisionModelWithProjection.from_pretrained(model_name)
@@ -1551,7 +1550,7 @@ class Blip2TextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict())
@slow
@require_torch_gpu
@require_torch_accelerator
def test_model_from_pretrained(self):
model_name = "Salesforce/blip2-itm-vit-g"
model = Blip2ForImageTextRetrieval.from_pretrained(model_name)

View File

@@ -23,9 +23,10 @@ from parameterized import parameterized
from transformers import Emu3Config, Emu3TextConfig, is_torch_available, is_vision_available, set_seed
from transformers.testing_utils import (
Expectations,
require_bitsandbytes,
require_torch,
require_torch_large_gpu,
require_torch_large_accelerator,
slow,
torch_device,
)
@@ -416,7 +417,7 @@ class Emu3IntegrationTest(unittest.TestCase):
@slow
@require_bitsandbytes
@require_torch_large_gpu
@require_torch_large_accelerator
def test_model_generation_batched(self):
model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
@@ -434,17 +435,27 @@ class Emu3IntegrationTest(unittest.TestCase):
)
# greedy generation outputs
EXPECTED_TEXT_COMPLETION = [
"USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.",
'USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a steep, reddish-brown cliff, which could be a'
] # fmt: skip
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): [
"USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and its head is lowered, suggesting a state of alertness or readiness. The animal's",
"USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a gently sloping hill with a reddish-brown hue,",
],
("cuda", 7): [
"USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.",
"USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a steep, reddish-brown cliff, which could be a",
],
}
) # fmt: skip
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@slow
@require_bitsandbytes
@require_torch_large_gpu
@require_torch_large_accelerator
def test_model_generation_multi_image(self):
model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
@@ -456,14 +467,20 @@ class Emu3IntegrationTest(unittest.TestCase):
inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)
# greedy generation outputs
EXPECTED_TEXT_COMPLETION = ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"] # fmt: skip
EXPECTED_TEXT_COMPLETIONS = Expectations(
{
("xpu", 3): ['USER: 64*6464*64What do these two images have in common? ASSISTANT: The two images both depict a rhinoceros, yet they are significantly different in terms of focus and clarity. The rhinoceros in the upper image is in sharp focus, showing detailed textures'],
("cuda", 7): ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"],
}
) # fmt: skip
EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@slow
@require_bitsandbytes
@require_torch_large_gpu
@require_torch_large_accelerator
def test_model_generate_images(self):
model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Gen-hf", load_in_4bit=True)
processor = Emu3Processor.from_pretrained("BAAI/Emu3-Gen-hf")