diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index 76a9fdfe5fb..b7dc7e541c9 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -27,7 +27,6 @@ from transformers.testing_utils import (
     require_torch,
     require_torch_accelerator,
     require_torch_fp16,
-    require_torch_gpu,
     require_torch_multi_accelerator,
     require_torch_sdpa,
     require_vision,
@@ -1400,7 +1399,7 @@ class Blip2VisionModelWithProjectionTest(ModelTesterMixin, unittest.TestCase):
         self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_model_from_pretrained(self):
         model_name = "Salesforce/blip2-itm-vit-g"
         model = Blip2VisionModelWithProjection.from_pretrained(model_name)
@@ -1551,7 +1550,7 @@ class Blip2TextRetrievalModelTest(ModelTesterMixin, unittest.TestCase):
         self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict())

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_model_from_pretrained(self):
         model_name = "Salesforce/blip2-itm-vit-g"
         model = Blip2ForImageTextRetrieval.from_pretrained(model_name)
diff --git a/tests/models/emu3/test_modeling_emu3.py b/tests/models/emu3/test_modeling_emu3.py
index 8f71e8a42c3..b27b1d4c708 100644
--- a/tests/models/emu3/test_modeling_emu3.py
+++ b/tests/models/emu3/test_modeling_emu3.py
@@ -23,9 +23,10 @@ from parameterized import parameterized
 from transformers import Emu3Config, Emu3TextConfig, is_torch_available, is_vision_available, set_seed
 from transformers.testing_utils import (
+    Expectations,
     require_bitsandbytes,
     require_torch,
-    require_torch_large_gpu,
+    require_torch_large_accelerator,
     slow,
     torch_device,
 )
@@ -416,7 +417,7 @@ class Emu3IntegrationTest(unittest.TestCase):

     @slow
     @require_bitsandbytes
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
     def test_model_generation_batched(self):
         model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
         processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
@@ -434,17 +435,27 @@ class Emu3IntegrationTest(unittest.TestCase):
         )

         # greedy generation outputs
-        EXPECTED_TEXT_COMPLETION = [
-            "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.",
-            'USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a steep, reddish-brown cliff, which could be a'
-        ]  # fmt: skip
+        EXPECTED_TEXT_COMPLETIONS = Expectations(
+            {
+                ("xpu", 3): [
+                    "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and its head is lowered, suggesting a state of alertness or readiness. The animal's",
+                    "USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a gently sloping hill with a reddish-brown hue,",
+                ],
+                ("cuda", 7): [
+                    "USER: 64*64Describe what do you see here? ASSISTANT: The image depicts a black panther in a crouched position. The panther's body is elongated and curved, with its head lowered and ears pointed forward, suggesting alertness or focus.",
+                    "USER: 64*64What can you say about the image? ASSISTANT: The image depicts a serene natural landscape. The foreground consists of a grassy area with some patches of bare earth. The middle ground shows a steep, reddish-brown cliff, which could be a",
+                ],
+            }
+        )  # fmt: skip
+        EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
+
         generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
         text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

     @slow
     @require_bitsandbytes
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
     def test_model_generation_multi_image(self):
         model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", load_in_4bit=True)
         processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")
@@ -456,14 +467,20 @@ class Emu3IntegrationTest(unittest.TestCase):
         inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)

         # greedy generation outputs
-        EXPECTED_TEXT_COMPLETION = ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"]  # fmt: skip
+        EXPECTED_TEXT_COMPLETIONS = Expectations(
+            {
+                ("xpu", 3): ['USER: 64*6464*64What do these two images have in common? ASSISTANT: The two images both depict a rhinoceros, yet they are significantly different in terms of focus and clarity. The rhinoceros in the upper image is in sharp focus, showing detailed textures'],
+                ("cuda", 7): ["USER: 64*6464*64What do these two images have in common? ASSISTANT: Both images feature a black animal, but they are not the same animal. The top image shows a close-up of a black cow's head, while the bottom image depicts a black cow in a natural"],
+            }
+        )  # fmt: skip
+        EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation()
         generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False)
         text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

     @slow
     @require_bitsandbytes
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
     def test_model_generate_images(self):
         model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Gen-hf", load_in_4bit=True)
         processor = Emu3Processor.from_pretrained("BAAI/Emu3-Gen-hf")
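
For context on the pattern this diff introduces: the Emu3 tests replace a single hard-coded reference list with transformers.testing_utils.Expectations, a mapping from (device_type, major_version) keys to per-device reference outputs, and get_expectation() returns the entry matching the machine the test runs on. The sketch below only approximates that lookup under stated assumptions: the class name ExpectationsSketch and the explicit device arguments are hypothetical (the real helper detects the current device itself), and the fallback rule is illustrative, not the library's actual resolution logic.

# Minimal sketch of a device-keyed expectations lookup, assuming explicit
# device arguments. Hypothetical names/signature; NOT the implementation in
# transformers.testing_utils, which auto-detects the running device.
from typing import Any


class ExpectationsSketch:
    def __init__(self, data: dict[tuple[str, int], Any]):
        self.data = data

    def get_expectation(self, device_type: str, major_version: int) -> Any:
        # Prefer an exact (device type, major version) match ...
        if (device_type, major_version) in self.data:
            return self.data[(device_type, major_version)]
        # ... otherwise fall back to any entry registered for the device type
        # (illustrative fallback rule, assumed for this sketch).
        for (dev, _ver), value in self.data.items():
            if dev == device_type:
                return value
        raise KeyError(f"no expectation registered for {device_type!r}")


# Usage: with the keys used in the Emu3 tests above, a CUDA device whose
# compute-capability major version is 7 resolves the ("cuda", 7) entry,
# while an Intel XPU runner resolves the ("xpu", 3) entry.
expectations = ExpectationsSketch({("xpu", 3): "xpu output", ("cuda", 7): "cuda output"})
assert expectations.get_expectation("cuda", 7) == "cuda output"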