enable mllama cases on xpu (#37644)

* enable mllama testing on xpu

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* enable more mllama cases

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* make cases pass on A100

Signed-off-by: N <matrix.yao@intel.com>

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Signed-off-by: N <matrix.yao@intel.com>
Yao Matrix 2025-04-22 23:39:10 +08:00 committed by GitHub
parent de182ba269
commit 9167461a7d


@@ -31,11 +31,12 @@ from transformers import (
 from transformers.cache_utils import Cache
 from transformers.models.mllama.configuration_mllama import MllamaTextConfig
 from transformers.testing_utils import (
+    Expectations,
     cleanup,
     require_bitsandbytes,
     require_read_token,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -524,7 +525,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         cleanup(torch_device, gc_collect=True)

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_generate(self):
@@ -537,9 +538,18 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch_device)

+        input_ids = inputs["input_ids"]
         # Check inputs ids
-        expected_input_ids = torch.tensor([[128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device) # fmt: skip
-        self.assertTrue(torch.equal(inputs["input_ids"], expected_input_ids))
+        expected_input_ids_all = Expectations(
+            {
+                ("xpu", 3): torch.tensor([[128000, 128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
+                ("cuda", 7): torch.tensor([[128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
+                ("cuda", 8): torch.tensor([[128000, 128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device),
+            }
+        ) # fmt: skip
+        expected_input_ids = expected_input_ids_all.get_expectation()
+        self.assertTrue(torch.equal(input_ids, expected_input_ids))

         # Load model in 4 bit
         quantization_config = BitsAndBytesConfig(load_in_4bit=True)
@@ -551,7 +561,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

         decoded_output = processor.decode(output[0], skip_special_tokens=True)
-        expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a" # fmt: skip
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+                ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a",
+                ("cuda", 8): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+            }
+        ) # fmt: skip
+        expected_output = expected_outputs.get_expectation()

         self.assertEqual(
             decoded_output,
@@ -560,7 +577,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_generate_text_only(self):
@@ -568,10 +585,18 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)
         prompt = "If I had to write a haiku"
         inputs = processor(text=prompt, return_tensors="pt").to(torch_device)
+        input_ids = inputs["input_ids"].cpu().squeeze().tolist()

         # Check inputs ids
-        expected_input_ids = [128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342]
-        self.assertEqual(inputs["input_ids"].cpu().squeeze().tolist(), expected_input_ids)
+        expected_input_ids_all = Expectations(
+            {
+                ("xpu", 3): [128000, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
+                ("cuda", 7): [128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
+                ("cuda", 8): [128000, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342],
+            }
+        )
+        expected_input_ids = expected_input_ids_all.get_expectation()
+        self.assertEqual(input_ids, expected_input_ids)

         # Load model in 4 bit
         quantization_config = BitsAndBytesConfig(load_in_4bit=True)
@@ -583,8 +608,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

         decoded_output = processor.decode(output[0], skip_special_tokens=True)
-        expected_output = "If I had to write a haiku about my life, I think it would be something like:\n\"Life is a messy stream\nTwists and turns, ups" # fmt: skip
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): "If I had to write a haiku about my life, I would write:\nLife is a messy tapestry\n Threads of joy and sorrow\nWeft of memories",
+                ("cuda", 7): "If I had to write a haiku about my life, I think it would be something like:\n\"Life is a messy stream\nTwists and turns, ups",
+                ("cuda", 8): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever",
+            }
+        ) # fmt: skip
+        expected_output = expected_outputs.get_expectation()

         self.assertEqual(
             decoded_output,
             expected_output,
@@ -592,7 +623,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_forward(self):
@@ -616,7 +647,15 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
             output = model(**inputs)

         actual_logits = output.logits[0, -1, :5].cpu()
-        expected_logits = torch.tensor([8.3594, 7.7148, 4.7266, 0.7803, 3.1504])
+        expected_logits_all = Expectations(
+            {
+                ("xpu", 3): torch.tensor([9.1562, 8.9141, 5.0664, 1.6855, 3.2324]),
+                ("cuda", 7): torch.tensor([8.3594, 7.7148, 4.7266, 0.7803, 3.1504]),
+                ("cuda", 8): torch.tensor([9.0703, 8.8750, 5.0781, 1.6279, 3.2207]),
+            }
+        )
+        expected_logits = expected_logits_all.get_expectation()

         self.assertTrue(
             torch.allclose(actual_logits, expected_logits, atol=0.1),
             f"Actual logits: {actual_logits}"
@@ -625,7 +664,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_batched_generate(self):
@@ -653,7 +692,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         # Check first output
         decoded_output = processor.decode(output[0], skip_special_tokens=True)
-        expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a" # fmt: skip
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+                ("cuda", 7): "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a",
+                ("cuda", 8): "If I had to write a haiku for this one, it would be:.\\nA dock on a lake.\\nA mountain in the distance.\\nA long exposure.",
+            }
+        ) # fmt: skip
+        expected_output = expected_outputs.get_expectation()

         self.assertEqual(
             decoded_output,
@@ -663,7 +709,14 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         # Check second output
         decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = "This image shows is a photograph of a stop sign in front of a Chinese archway. The stop sign is red with white letters and is" # fmt: skip
+        expected_outputs = Expectations(
+            {
+                ("xpu", 3): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
+                ("cuda", 7): "This image shows is a photograph of a stop sign in front of a Chinese archway. The stop sign is red with white letters and is",
+                ("cuda", 8): "This image shows\nI'm not able to provide information on the person in this image. I can give you an idea of what's happening",
+            }
+        ) # fmt: skip
+        expected_output = expected_outputs.get_expectation()

         self.assertEqual(
             decoded_output,
@@ -672,7 +725,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
         )

     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @require_read_token
     def test_11b_model_integration_multi_image_generate(self):