mirror of https://github.com/huggingface/transformers.git
Update some tests for torch 2.7.1 (#38701)
* fix 1 * fix 2 * fix 3 * fix 4 * fp16 * break * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent afdb821318
commit 04cdf83244
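Most of the hunks below switch integration tests from torch.bfloat16 to torch.float16 and re-record hardware-specific expected values through the Expectations helper used across the transformers test suite. The sketch below is only a rough approximation of that helper (the real one lives in transformers.testing_utils and handles more device types); it illustrates the idea of keying expected values by device type and major compute capability, which is what the ("cuda", 7) / ("cuda", 8) entries in the hunks correspond to (sm_7x and sm_8x GPUs).

# Rough sketch only -- an approximation of the Expectations helper from
# transformers.testing_utils, not the library's actual implementation.
import torch


def _device_key():
    # Return a (device_type, major_capability) key, e.g. ("cuda", 8) on an A100.
    if torch.cuda.is_available():
        major, _minor = torch.cuda.get_device_capability()
        return ("cuda", major)
    return ("cpu", None)


class Expectations:
    """Map (device_type, major_capability) keys to expected test values (sketch)."""

    def __init__(self, data):
        self.data = data

    def get_expectation(self):
        key = _device_key()
        if key in self.data:
            return self.data[key]
        # Simplification for this sketch: fall back to any entry for the same
        # device type, else to the first registered value.
        for (device, _version), value in self.data.items():
            if device == key[0]:
                return value
        return next(iter(self.data.values()))


# Usage mirroring the tests touched by this commit:
expected_outputs = Expectations(
    {
        ("cuda", 7): "output recorded on sm_7x GPUs",
        ("cuda", 8): "output recorded on sm_8x GPUs",
    }
)
expected_output = expected_outputs.get_expectation()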
@@ -463,11 +463,11 @@ class ChameleonIntegrationTest(unittest.TestCase):
                    'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
                ],
                ("cuda", 7): [
-                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and',
+                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
                    'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
                ],
                ("cuda", 8): [
-                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in',
+                    'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
                    'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
                ],
            }

@@ -299,7 +299,7 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
        """
        model = ColQwen2ForRetrieval.from_pretrained(
            self.model_name,
-            torch_dtype=torch.bfloat16,
+            torch_dtype=torch.float16,
            load_in_8bit=True,
        ).eval()

@@ -331,14 +331,14 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
        expectations = Expectations(
            {
                ("cuda", 7): [
-                    [15.5000, 8.1250, 14.9375],
-                    [9.0625, 17.1250, 10.6875],
-                    [15.9375, 12.1875, 20.2500],
+                    [15.0938, 8.3203, 15.0391],
+                    [9.6328, 16.9062, 10.5312],
+                    [15.6562, 12.2656, 20.2969],
                ],
                ("cuda", 8): [
-                    [15.1250, 8.6875, 15.0625],
-                    [9.2500, 17.2500, 10.3750],
-                    [15.9375, 12.3750, 20.2500],
+                    [15.0703, 8.7422, 15.0312],
+                    [9.5078, 16.8906, 10.6250],
+                    [15.6484, 12.3984, 20.4688],
                ],
            }
        )

@@ -292,7 +292,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -300,19 +300,20 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
        )
-        expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+        expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
+
        self.assertEqual(decoded_output, expected_output)

    def test_qwen2_small_model_integration_forward(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -320,7 +321,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)

        # Forward
        with torch.inference_mode():
@@ -329,9 +330,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        actual_logits = output.logits[0, -1, :5].cpu()
        expected_logits_all = Expectations(
            {
-                ("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
-                ("cuda", 7): torch.tensor([11.9375, 14.7500, 14.4375, 10.8125, 7.0938], dtype=torch.bfloat16),
-                ("cuda", 8): torch.tensor([11.8750, 14.8125, 14.3125, 10.8125, 6.9375], dtype=torch.bfloat16),
+                ("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16),
+                ("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562, 6.9219], dtype=torch.float16),
+                ("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484, 6.9141], dtype=torch.float16),
            }
        ) # fmt: skip
        expected_logits = expected_logits_all.get_expectation()
@@ -347,10 +348,10 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate_text_only(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
        decoded_output = processor.decode(
@@ -360,8 +361,8 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-                ("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-                ("cuda", 8): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
+                ("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
+                ("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -371,7 +372,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_generate_chat_template(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        messages = [
            {
@@ -385,20 +386,21 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
        )
-        expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+        expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
+
        self.assertEqual(decoded_output, expected_output)

    @require_deterministic_for_xpu
    def test_qwen2_small_model_integration_batched_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -409,7 +411,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -417,6 +419,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -428,7 +431,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
-                ("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Arch,"',
+                ("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -442,7 +445,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_batched_generate_multi_image(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -466,7 +469,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        )

        inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -548,7 +551,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
    def test_qwen2_small_model_integration_interleaved_images_videos(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+            self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
        )
        messages = [
            [
@@ -600,7 +603,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
            return_tensors="pt",
            padding=True,
            num_frames=8,
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

@@ -609,10 +612,11 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
-                ("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
+                ("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -623,7 +627,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
-                ("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
+                ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -635,7 +639,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):

        # Check third output
        decoded_output = processor.decode(output[2], skip_special_tokens=True)
-        expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
+        expected_output = (
+            "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace."
+        )
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -657,7 +663,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -665,7 +671,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
@@ -677,7 +683,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_forward(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
@@ -685,7 +691,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        prompt = (
            "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)

        # Forward
        with torch.inference_mode():
@@ -695,12 +701,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        expected_logits_all = Expectations(
            {
-                ("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-                ("cuda", 7): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-                ("cuda", 8): torch.tensor([-9.8750, -0.5117, 1.4297, -10.3750, -10.3750], dtype=torch.bfloat16),
+                ("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.float16),
+                ("cuda", 7): torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16),
+                ("cuda", 8): torch.tensor([-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], dtype=torch.float16),
            }
        ) # fmt: skip
-        expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.bfloat16)
+        expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.float16)

        # The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
        # The difference is likely due to the different implementations of the attention mechanism (different order of operations)
@@ -716,22 +722,30 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_generate_text_only(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+        inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
        decoded_output = processor.decode(
            generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
        )
-        expected_output = "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake."
+
+        expected_outputs = Expectations(
+            {
+                ("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
+                ("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
+            }
+        )
+        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(decoded_output, expected_output)

    def test_llama_small_model_integration_generate_chat_template(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        messages = [
            {
@@ -745,7 +759,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        inputs = processor.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)
        with torch.no_grad():
            generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        decoded_output = processor.decode(
@@ -757,7 +771,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_batched_generate(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -768,7 +782,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -778,11 +792,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-                ("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-                ("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, still waters.",
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+                ("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -791,7 +806,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters' # fmt: skip
+        expected_output = "user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters"
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -801,7 +816,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_batched_generate_multi_image(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+            self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
        )
        # Prepare inputs
        prompt = [
@@ -825,7 +840,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        )

        inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-            torch_device, dtype=torch.bfloat16
+            torch_device, dtype=torch.float16
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -833,7 +848,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
-        expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace." # fmt: skip
+        expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors."
+
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -842,7 +858,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):

        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
-        expected_output = 'user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences' # fmt: skip
+        expected_output = "user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences"
        self.assertEqual(
            decoded_output,
            expected_output,
@@ -893,7 +909,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
    def test_llama_small_model_integration_interleaved_images_videos(self):
        processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
        model = InternVLForConditionalGeneration.from_pretrained(
-            self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+            self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
        )
        messages = [
            [
@@ -945,7 +961,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
            return_tensors="pt",
            padding=True,
            num_frames=8,
-        ).to(torch_device, dtype=torch.bfloat16)
+        ).to(torch_device, dtype=torch.float16)

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

@@ -954,8 +970,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-                ("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-                ("cuda", 8): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences",
+                ("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
+                ("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -970,8 +986,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-                ("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-                ("cuda", 8): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
+                ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
+                ("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()
@@ -986,8 +1002,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
        expected_outputs = Expectations(
            {
                ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-                ("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-                ("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
+                ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+                ("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
            }
        ) # fmt: skip
        expected_output = expected_outputs.get_expectation()

@@ -248,7 +248,7 @@ class Qwen3IntegrationTest(unittest.TestCase):
        tokenizer = AutoTokenizer.from_pretrained(qwen_model, pad_token="</s>", padding_side="right")
        if is_torch_greater_or_equal("2.7.0"):
            strict = False # Due to https://github.com/pytorch/pytorch/issues/150994
-            EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unsalted, unsweetened, and unflavored."]
+            EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated."]
        else:
            strict = True
            EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated. It is"]

@@ -422,11 +422,13 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
        output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
        output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)

-        EXPECTED_OUTPUT_STR = (
-            "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place"
-            " looks like a"
-        )
-        self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
+        EXPECTED_OUTPUT_STRS = [
+            # torch 2.6
+            "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place looks like a",
+            # torch 2.7
+            "Today is a nice day and the sun is shining. A nice day with warm rainy and windy weather today.",
+        ]
+        self.assertIn(output_str, EXPECTED_OUTPUT_STRS)

    @require_torch_accelerator
    @require_torch_fp16
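The last two hunks illustrate the other theme of this commit: greedy-decoding outputs can legitimately change between torch releases, so the Qwen3 test branches on the installed version while the XGLM test accepts any known-good output via assertIn. The sketch below restates that pattern under the assumption that the same is_torch_greater_or_equal helper the Qwen3 test already calls is importable from transformers.utils; the expected strings are the ones recorded in the hunks above.

# Minimal sketch of the version-gating pattern used above (assumption: the
# is_torch_greater_or_equal helper is imported from transformers.utils).
from transformers.utils import is_torch_greater_or_equal

if is_torch_greater_or_equal("2.7.0"):
    # Value recorded in this commit for torch >= 2.7.
    EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated."]
else:
    # Value kept for older torch releases.
    EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated. It is"]

# The XGLM hunk takes the alternative route: keep every known-good output in a
# list and assert membership, so one test body covers both torch 2.6 and 2.7.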