mirror of https://github.com/huggingface/transformers.git
Update some tests for torch 2.7.1 (#38701)
* fix 1 * fix 2 * fix 3 * fix 4 * fp16 * break * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent afdb821318
commit 04cdf83244
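Most of the hunks below switch integration tests from torch.bfloat16 to torch.float16 and re-record hardware-specific expected values through the Expectations helper used across the transformers test suite. The sketch below is only a rough approximation of that helper (the real one lives in transformers.testing_utils and handles more device types); it illustrates the idea of keying expected values by device type and major compute capability, which is what the ("cuda", 7) / ("cuda", 8) entries in the hunks correspond to (sm_7x and sm_8x GPUs).

# Rough sketch only -- an approximation of the Expectations helper from
# transformers.testing_utils, not the library's actual implementation.
import torch


def _device_key():
    # Return a (device_type, major_capability) key, e.g. ("cuda", 8) on an A100.
    if torch.cuda.is_available():
        major, _minor = torch.cuda.get_device_capability()
        return ("cuda", major)
    return ("cpu", None)


class Expectations:
    """Map (device_type, major_capability) keys to expected test values (sketch)."""

    def __init__(self, data):
        self.data = data

    def get_expectation(self):
        key = _device_key()
        if key in self.data:
            return self.data[key]
        # Simplification for this sketch: fall back to any entry for the same
        # device type, else to the first registered value.
        for (device, _version), value in self.data.items():
            if device == key[0]:
                return value
        return next(iter(self.data.values()))


# Usage mirroring the tests touched by this commit:
expected_outputs = Expectations(
    {
        ("cuda", 7): "output recorded on sm_7x GPUs",
        ("cuda", 8): "output recorded on sm_8x GPUs",
    }
)
expected_output = expected_outputs.get_expectation()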
@@ -463,11 +463,11 @@ class ChameleonIntegrationTest(unittest.TestCase):
                    'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
                ],
                ("cuda", 7): [
-                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and',
+                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
                    'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
                ],
                ("cuda", 8): [
-                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in',
+                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
                    'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
                ],
            }

@@ -299,7 +299,7 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
        """
        model = ColQwen2ForRetrieval.from_pretrained(
            self.model_name,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch.float16,
            load_in_8bit=True,
        ).eval()

@@ -331,14 +331,14 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
        expectations = Expectations(
            {
                ("cuda", 7): [
-                    [15.5000, 8.1250, 14.9375],
-                    [9.0625, 17.1250, 10.6875],
-                    [15.9375, 12.1875, 20.2500],
+                    [15.0938, 8.3203, 15.0391],
+                    [9.6328, 16.9062, 10.5312],
+                    [15.6562, 12.2656, 20.2969],
                ],
                ("cuda", 8): [
-                    [15.1250, 8.6875, 15.0625],
-                    [9.2500, 17.2500, 10.3750],
-                    [15.9375, 12.3750, 20.2500],
+                    [15.0703, 8.7422, 15.0312],
+                    [9.5078, 16.8906, 10.6250],
+                    [15.6484, 12.3984, 20.4688],
                ],
            }
        )

@@ -292,7 +292,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -300,19 +300,20 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
        )
-        expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+        expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
+
        self.assertEqual(decoded_output, expected_output)

    def test_qwen2_small_model_integration_forward(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -320,7 +321,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)

        # Forward
        with torch.inference_mode():
@@ -329,9 +330,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        actual_logits = output.logits[0, -1, :5].cpu()
        expected_logits_all = Expectations(
            {
-                ("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
-                ("cuda", 7): torch.tensor([11.9375, 14.7500, 14.4375, 10.8125, 7.0938], dtype=torch.bfloat16),
-                ("cuda", 8): torch.tensor([11.8750, 14.8125, 14.3125, 10.8125, 6.9375], dtype=torch.bfloat16),
+                ("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16),
+                ("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562, 6.9219], dtype=torch.float16),
+                ("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484, 6.9141], dtype=torch.float16),
            }
        ) # fmt: skip
        expected_logits = expected_logits_all.get_expectation()
@@ -347,10 +348,10 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate_text_only(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
        decoded_output = processor.decode(
@@ -360,8 +361,8 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-                ("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-                ("cuda", 8): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
+                ("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
+                ("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -371,7 +372,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate_chat_template(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        messages = [
            {
@@ -385,20 +386,21 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
        )
-        expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+        expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
+
        self.assertEqual(decoded_output, expected_output)

    @require_deterministic_for_xpu
    def test_qwen2_small_model_integration_batched_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -409,7 +411,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -417,6 +419,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -428,7 +431,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
-                ("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Arch,"',
+                ("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -442,7 +445,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_batched_generate_multi_image(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -466,7 +469,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        )

        inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -548,7 +551,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_interleaved_images_videos(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+            self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
        )
        messages = [
            [
@@ -600,7 +603,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
            return_tensors="pt",
            padding=True,
            num_frames=8,
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

@@ -609,10 +612,11 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
-                ("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
+                ("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -623,7 +627,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
-                ("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
+                ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -635,7 +639,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):

        # Check third output
        decoded_output = processor.decode(output[2], skip_special_tokens=True)
-        expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
+        expected_output = (
+            "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace."
+        )
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -657,7 +663,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -665,7 +671,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
@@ -677,7 +683,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_forward(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -685,7 +691,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)

        # Forward
        with torch.inference_mode():
@@ -695,12 +701,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        expected_logits_all = Expectations(
            {
-                ("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-                ("cuda", 7): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-                ("cuda", 8): torch.tensor([-9.8750, -0.5117, 1.4297, -10.3750, -10.3750], dtype=torch.bfloat16),
+                ("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.float16),
+                ("cuda", 7): torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16),
+                ("cuda", 8): torch.tensor([-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], dtype=torch.float16),
            }
        ) # fmt: skip
-        expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.bfloat16)
+        expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.float16)

        # The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
        # The difference is likely due to the different implementations of the attention mechanism (different order of operations)
@@ -716,22 +722,30 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_generate_text_only(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
        decoded_output = processor.decode(
            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
        )
-        expected_output = "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake."
+
+        expected_outputs = Expectations(
+            {
+                ("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
+                ("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
+            }
+        )
+        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(decoded_output, expected_output)

    def test_llama_small_model_integration_generate_chat_template(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        messages = [
            {
@@ -745,7 +759,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
@@ -757,7 +771,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_batched_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -768,7 +782,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -778,11 +792,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-                ("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-                ("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, still waters.",
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+                ("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -791,7 +806,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters' # fmt: skip
+        expected_output = "user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters"
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -801,7 +816,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_batched_generate_multi_image(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -825,7 +840,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        )

        inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -833,7 +848,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
-        expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace." # fmt: skip
+        expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors."
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -842,7 +858,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = 'user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences' # fmt: skip
+        expected_output = "user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences"
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -893,7 +909,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_interleaved_images_videos(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+            self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
        )
        messages = [
            [
@@ -945,7 +961,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
            return_tensors="pt",
            padding=True,
            num_frames=8,
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

@@ -954,8 +970,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-                ("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-                ("cuda", 8): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences",
+                ("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
+                ("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -970,8 +986,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-                ("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-                ("cuda", 8): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
+                ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
+                ("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -986,8 +1002,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-                ("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-                ("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+                ("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()

@@ -248,7 +248,7 @@ class Qwen3IntegrationTest(unittest.TestCase):
        tokenizer = AutoTokenizer.from_pretrained(qwen_model, pad_token="</s>", padding_side="right")
        if is_torch_greater_or_equal("2.7.0"):
            strict = False # Due to https://github.com/pytorch/pytorch/issues/150994
-            EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unsalted, unsweetened, and unflavored."]
+            EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated."]
        else:
            strict = True
            EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated. It is"]

@@ -422,11 +422,13 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
        output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
        output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)

-        EXPECTED_OUTPUT_STR = (
-            "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place"
-            " looks like a"
-        )
-        self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
+        EXPECTED_OUTPUT_STRS = [
+            # torch 2.6
+            "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place looks like a",
+            # torch 2.7
+            "Today is a nice day and the sun is shining. A nice day with warm rainy and windy weather today.",
+        ]
+        self.assertIn(output_str, EXPECTED_OUTPUT_STRS)

    @require_torch_accelerator
    @require_torch_fp16
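The last two hunks illustrate the other theme of this commit: greedy-decoding outputs can legitimately change between torch releases, so the Qwen3 test branches on the installed version while the XGLM test accepts any known-good output via assertIn. The sketch below restates that pattern under the assumption that the same is_torch_greater_or_equal helper the Qwen3 test already calls is importable from transformers.utils; the expected strings are the ones recorded in the hunks above.

# Minimal sketch of the version-gating pattern used above (assumption: the
# is_torch_greater_or_equal helper is imported from transformers.utils).
from transformers.utils import is_torch_greater_or_equal

if is_torch_greater_or_equal("2.7.0"):
    # Value recorded in this commit for torch >= 2.7.
    EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated."]
else:
    # Value kept for older torch releases.
    EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated. It is"]

# The XGLM hunk takes the alternative route: keep every known-good output in a
# list and assert membership, so one test body covers both torch 2.6 and 2.7.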