Update some tests for torch 2.7.1 (#38701)

* fix 1

* fix 2

* fix 3

* fix 4

* fp16

* break

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Yih-Dar 2025-06-10 11:46:52 +02:00 committed by GitHub
parent afdb821318
commit 04cdf83244
5 changed files with 88 additions and 70 deletions


@@ -463,11 +463,11 @@ class ChameleonIntegrationTest(unittest.TestCase):
'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
],
("cuda", 7): [
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and',
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
],
("cuda", 8): [
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in',
'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
],
}
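The expectations updated throughout this commit are keyed on the accelerator type plus, for CUDA, the compute capability major version (e.g. 7 for V100-class GPUs, 8 for A100-class), and the matching entry is selected at runtime via Expectations(...).get_expectation(). Below is a minimal, hypothetical sketch of that lookup pattern; the actual Expectations helper used by the transformers test suite may resolve keys differently.

# Hypothetical sketch of a device-keyed expectation lookup; not the
# actual helper used by the transformers test suite.
import torch


class Expectations:
    def __init__(self, data):
        # data maps (device_type, capability_major_or_None) -> expected value
        self.data = data

    def get_expectation(self):
        if torch.cuda.is_available():
            device = "cuda"
            major = torch.cuda.get_device_capability()[0]  # e.g. 7 or 8
        else:
            device, major = "cpu", None
        # Prefer an exact match, then fall back to any entry for the device type.
        if (device, major) in self.data:
            return self.data[(device, major)]
        for (dev, _), value in self.data.items():
            if dev == device:
                return value
        raise KeyError(f"no expectation registered for {device} (capability {major})")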


@@ -299,7 +299,7 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
"""
model = ColQwen2ForRetrieval.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
torch_dtype=torch.float16,
load_in_8bit=True,
).eval()
@@ -331,14 +331,14 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
expectations = Expectations(
{
("cuda", 7): [
[15.5000, 8.1250, 14.9375],
[9.0625, 17.1250, 10.6875],
[15.9375, 12.1875, 20.2500],
[15.0938, 8.3203, 15.0391],
[9.6328, 16.9062, 10.5312],
[15.6562, 12.2656, 20.2969],
],
("cuda", 8): [
[15.1250, 8.6875, 15.0625],
[9.2500, 17.2500, 10.3750],
[15.9375, 12.3750, 20.2500],
[15.0703, 8.7422, 15.0312],
[9.5078, 16.8906, 10.6250],
[15.6484, 12.3984, 20.4688],
],
}
)


@@ -292,7 +292,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -300,19 +300,20 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
self.assertEqual(decoded_output, expected_output)
def test_qwen2_small_model_integration_forward(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -320,7 +321,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
# Forward
with torch.inference_mode():
@@ -329,9 +330,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
actual_logits = output.logits[0, -1, :5].cpu()
expected_logits_all = Expectations(
{
("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
("cuda", 7): torch.tensor([11.9375, 14.7500, 14.4375, 10.8125, 7.0938], dtype=torch.bfloat16),
("cuda", 8): torch.tensor([11.8750, 14.8125, 14.3125, 10.8125, 6.9375], dtype=torch.bfloat16),
("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16),
("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562, 6.9219], dtype=torch.float16),
("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484, 6.9141], dtype=torch.float16),
}
) # fmt: skip
expected_logits = expected_logits_all.get_expectation()
@@ -347,10 +348,10 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_generate_text_only(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
decoded_output = processor.decode(
@@ -360,8 +361,8 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
("cuda", 8): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -371,7 +372,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_generate_chat_template(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
messages = [
{
@@ -385,20 +386,21 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
self.assertEqual(decoded_output, expected_output)
@require_deterministic_for_xpu
def test_qwen2_small_model_integration_batched_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -409,7 +411,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -417,6 +419,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
# Check first output
decoded_output = processor.decode(output[0], skip_special_tokens=True)
expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
self.assertEqual(
decoded_output,
expected_output,
@@ -428,7 +431,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Arch,"',
("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -442,7 +445,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_batched_generate_multi_image(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -466,7 +469,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
)
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -548,7 +551,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
def test_qwen2_small_model_integration_interleaved_images_videos(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
)
messages = [
[
@@ -600,7 +603,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
return_tensors="pt",
padding=True,
num_frames=8,
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -609,10 +612,11 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
@@ -623,7 +627,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -635,7 +639,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
# Check third output
decoded_output = processor.decode(output[2], skip_special_tokens=True)
expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
expected_output = (
"user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace."
)
self.assertEqual(
decoded_output,
expected_output,
@@ -657,7 +663,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -665,7 +671,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
@@ -677,7 +683,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_forward(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
@@ -685,7 +691,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
prompt = (
"<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
# Forward
with torch.inference_mode():
@@ -695,12 +701,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_logits_all = Expectations(
{
("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
("cuda", 7): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
("cuda", 8): torch.tensor([-9.8750, -0.5117, 1.4297, -10.3750, -10.3750], dtype=torch.bfloat16),
("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.float16),
("cuda", 7): torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16),
("cuda", 8): torch.tensor([-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], dtype=torch.float16),
}
) # fmt: skip
expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.bfloat16)
expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.float16)
# The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
# The difference is likely due to the different implementations of the attention mechanism (different order of operations)
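Because the float16 logits are only expected to match the reference values within a tolerance (see the comment above about the differing attention implementations), the comparison is element-wise rather than exact. Below is a rough, self-contained illustration of such a check; the tolerances are assumptions, not the values used in the actual test.

# Illustrative tolerance check only; rtol/atol are made-up values.
import torch
from torch.testing import assert_close

expected_logits = torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16)
actual_logits = expected_logits + 2e-3  # stand-in for output.logits[0, -1, :5].cpu()

assert_close(actual_logits, expected_logits, rtol=1e-2, atol=1e-2)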
@@ -716,22 +722,30 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_generate_text_only(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
decoded_output = processor.decode(
generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
)
expected_output = "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake."
expected_outputs = Expectations(
{
("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
}
)
expected_output = expected_outputs.get_expectation()
self.assertEqual(decoded_output, expected_output)
def test_llama_small_model_integration_generate_chat_template(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
messages = [
{
@@ -745,7 +759,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
with torch.no_grad():
generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
decoded_output = processor.decode(
@@ -757,7 +771,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_batched_generate(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -768,7 +782,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -778,11 +792,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, still waters.",
("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
self.assertEqual(
decoded_output,
expected_output,
@@ -791,7 +806,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
# Check second output
decoded_output = processor.decode(output[1], skip_special_tokens=True)
expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters' # fmt: skip
expected_output = "user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters"
self.assertEqual(
decoded_output,
expected_output,
@@ -801,7 +816,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_batched_generate_multi_image(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
)
# Prepare inputs
prompt = [
@@ -825,7 +840,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
)
inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
torch_device, dtype=torch.bfloat16
torch_device, dtype=torch.float16
)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -833,7 +848,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
# Check first output
decoded_output = processor.decode(output[0], skip_special_tokens=True)
# Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace." # fmt: skip
expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors."
self.assertEqual(
decoded_output,
expected_output,
@@ -842,7 +858,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
# Check second output
decoded_output = processor.decode(output[1], skip_special_tokens=True)
expected_output = 'user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences' # fmt: skip
expected_output = "user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences"
self.assertEqual(
decoded_output,
expected_output,
@@ -893,7 +909,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
def test_llama_small_model_integration_interleaved_images_videos(self):
processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
model = InternVLForConditionalGeneration.from_pretrained(
self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
)
messages = [
[
@@ -945,7 +961,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
return_tensors="pt",
padding=True,
num_frames=8,
).to(torch_device, dtype=torch.bfloat16)
).to(torch_device, dtype=torch.float16)
output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -954,8 +970,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
("cuda", 8): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences",
("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -970,8 +986,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
("cuda", 8): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()
@@ -986,8 +1002,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
expected_outputs = Expectations(
{
("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
}
) # fmt: skip
expected_output = expected_outputs.get_expectation()


@@ -248,7 +248,7 @@ class Qwen3IntegrationTest(unittest.TestCase):
tokenizer = AutoTokenizer.from_pretrained(qwen_model, pad_token="</s>", padding_side="right")
if is_torch_greater_or_equal("2.7.0"):
strict = False # Due to https://github.com/pytorch/pytorch/issues/150994
EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unsalted, unsweetened, and unflavored."]
EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated."]
else:
strict = True
EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated. It is"]
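The hunk above branches on the installed torch version, adjusting both the strict flag and the expected completion. Below is a rough sketch of how such a version gate can be written; transformers ships its own is_torch_greater_or_equal utility, which may be implemented differently.

# Hypothetical version gate built on the packaging library; the real
# transformers utility may differ.
import torch
from packaging import version


def is_torch_greater_or_equal(min_version: str) -> bool:
    return version.parse(torch.__version__) >= version.parse(min_version)


strict = not is_torch_greater_or_equal("2.7.0")  # mirrors the test's branching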


@@ -422,11 +422,13 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
EXPECTED_OUTPUT_STR = (
"Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place"
" looks like a"
)
self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
EXPECTED_OUTPUT_STRS = [
# torch 2.6
"Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place looks like a",
# torch 2.7
"Today is a nice day and the sun is shining. A nice day with warm rainy and windy weather today.",
]
self.assertIn(output_str, EXPECTED_OUTPUT_STRS)
@require_torch_accelerator
@require_torch_fp16
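The XGLM test above generates with do_sample=True, so the decoded text depends both on the model's logits and on the RNG stream consumed during sampling; small numerical or sampling changes between torch releases can shift which tokens are drawn, which is why the assertion now accepts any of the known-good outputs. A toy, self-contained illustration of seeded sampling, unrelated to the actual model:

# Toy example: the drawn index is reproducible for a fixed seed and torch
# build, but expected values may still need updating across torch releases.
import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.2, 0.3, 0.4])
drawn = torch.multinomial(probs, num_samples=1)
print(drawn.item())  # stable on repeated runs with the same torch version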