Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-04 13:20:12 +06:00)
Update some tests for torch 2.7.1 (#38701)
* fix 1 * fix 2 * fix 3 * fix 4 * fp16 * break * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent afdb821318 · commit 04cdf83244
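
Almost every hunk in this diff swaps the reference values stored in an `Expectations` table keyed by device type and major compute capability, then resolves them at runtime with `get_expectation()`. The snippet below is a minimal sketch of that pattern, not the actual `transformers.testing_utils.Expectations` implementation: the device detection and the fallback rule shown here are simplifying assumptions for illustration.

# Minimal sketch of the device-keyed expectation lookup used throughout these
# tests. This is NOT the transformers implementation; the matching rules of the
# real transformers.testing_utils.Expectations helper are more elaborate.
import torch


class Expectations:
    def __init__(self, data):
        # Keys are (device_type, major_version) tuples, e.g. ("cuda", 8) for
        # an Ampere (compute capability 8.x) GPU.
        self.data = data

    def get_expectation(self):
        if torch.cuda.is_available():
            major, _minor = torch.cuda.get_device_capability()
            key = ("cuda", major)
        else:
            key = ("cpu", None)
        # Assumed fallback: return any stored value when the exact key is missing.
        return self.data.get(key, next(iter(self.data.values())))


expected_outputs = Expectations(
    {
        ("cuda", 7): "reference output captured on a compute capability 7.x GPU",
        ("cuda", 8): "reference output captured on a compute capability 8.x GPU",
    }
)
print(expected_outputs.get_expectation())

Most hunks below re-capture the ("cuda", 7) and ("cuda", 8) entries after the affected tests were switched from bfloat16 to float16 and re-run under torch 2.7.1.
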
@@ -463,11 +463,11 @@ class ChameleonIntegrationTest(unittest.TestCase):
 'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
 ],
 ("cuda", 7): [
-'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and',
+'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
 'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
 ],
 ("cuda", 8): [
-'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot in the center representing the star Alpha Centauri. The star map is a representation of the night sky, showing the positions of stars in',
+'Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue dot representing the position of the star Alpha Centauri. Alpha Centauri is the brightest star in the constellation Centaurus and is located',
 'What constellation is this image showing?The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.The image shows the constellation of Orion.',
 ],
 }
@@ -299,7 +299,7 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
 """
 model = ColQwen2ForRetrieval.from_pretrained(
 self.model_name,
-torch_dtype=torch.bfloat16,
+torch_dtype=torch.float16,
 load_in_8bit=True,
 ).eval()
 
@@ -331,14 +331,14 @@ class ColQwen2ModelIntegrationTest(unittest.TestCase):
 expectations = Expectations(
 {
 ("cuda", 7): [
-[15.5000, 8.1250, 14.9375],
-[9.0625, 17.1250, 10.6875],
-[15.9375, 12.1875, 20.2500],
+[15.0938, 8.3203, 15.0391],
+[9.6328, 16.9062, 10.5312],
+[15.6562, 12.2656, 20.2969],
 ],
 ("cuda", 8): [
-[15.1250, 8.6875, 15.0625],
-[9.2500, 17.2500, 10.3750],
-[15.9375, 12.3750, 20.2500],
+[15.0703, 8.7422, 15.0312],
+[9.5078, 16.8906, 10.6250],
+[15.6484, 12.3984, 20.4688],
 ],
 }
 )
@@ -292,7 +292,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 def test_qwen2_small_model_integration_generate(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
@@ -300,19 +300,20 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 prompt = (
 "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
 )
-inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
 with torch.no_grad():
 generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
 decoded_output = processor.decode(
 generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
 )
-expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
 
 self.assertEqual(decoded_output, expected_output)
 
 def test_qwen2_small_model_integration_forward(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
@@ -320,7 +321,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 prompt = (
 "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
 )
-inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
 
 # Forward
 with torch.inference_mode():
@@ -329,9 +330,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 actual_logits = output.logits[0, -1, :5].cpu()
 expected_logits_all = Expectations(
 {
-("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.bfloat16),
-("cuda", 7): torch.tensor([11.9375, 14.7500, 14.4375, 10.8125, 7.0938], dtype=torch.bfloat16),
-("cuda", 8): torch.tensor([11.8750, 14.8125, 14.3125, 10.8125, 6.9375], dtype=torch.bfloat16),
+("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16),
+("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562, 6.9219], dtype=torch.float16),
+("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484, 6.9141], dtype=torch.float16),
 }
 ) # fmt: skip
 expected_logits = expected_logits_all.get_expectation()
@@ -347,10 +348,10 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 def test_qwen2_small_model_integration_generate_text_only(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
 with torch.no_grad():
 generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
 decoded_output = processor.decode(
@@ -360,8 +361,8 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-("cuda", 7): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.",
-("cuda", 8): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light begins.",
+("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
+("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
@@ -371,7 +372,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 def test_qwen2_small_model_integration_generate_chat_template(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 messages = [
 {
@@ -385,20 +386,21 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 
 inputs = processor.apply_chat_template(
 messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-).to(torch_device, dtype=torch.bfloat16)
+).to(torch_device, dtype=torch.float16)
 with torch.no_grad():
 generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
 decoded_output = processor.decode(
 generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
 )
-expected_output = "The image shows two cats lying on a pink blanket. The cat on the left is a tabby"
+expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch."
 
 self.assertEqual(decoded_output, expected_output)
 
 @require_deterministic_for_xpu
 def test_qwen2_small_model_integration_batched_generate(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 # Prepare inputs
 prompt = [
@@ -409,7 +411,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
 
 inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-torch_device, dtype=torch.bfloat16
+torch_device, dtype=torch.float16
 )
 
 output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -417,6 +419,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 # Check first output
 decoded_output = processor.decode(output[0], skip_special_tokens=True)
 expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
 
 self.assertEqual(
 decoded_output,
 expected_output,
@@ -428,7 +431,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate"',
-("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Arch,"',
+("cuda", 7): 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese archway, known as a "Chinese Gate" or "Chinese Gate of',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
@@ -442,7 +445,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 def test_qwen2_small_model_integration_batched_generate_multi_image(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 # Prepare inputs
 prompt = [
@@ -466,7 +469,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 )
 
 inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-torch_device, dtype=torch.bfloat16
+torch_device, dtype=torch.float16
 )
 
 output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -548,7 +551,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 def test_qwen2_small_model_integration_interleaved_images_videos(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
 )
 messages = [
 [
@@ -600,7 +603,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 return_tensors="pt",
 padding=True,
 num_frames=8,
-).to(torch_device, dtype=torch.bfloat16)
+).to(torch_device, dtype=torch.float16)
 
 output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
 
@@ -609,10 +612,11 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
-("cuda", 7): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an",
+("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
 
 self.assertEqual(
 decoded_output,
 expected_output,
@@ -623,7 +627,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
-("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.",
+("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
@@ -635,7 +639,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase):
 
 # Check third output
 decoded_output = processor.decode(output[2], skip_special_tokens=True)
-expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip
+expected_output = (
+"user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace."
+)
 self.assertEqual(
 decoded_output,
 expected_output,
@@ -657,7 +663,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 def test_llama_small_model_integration_generate(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
@@ -665,7 +671,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 prompt = (
 "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
 )
-inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
 with torch.no_grad():
 generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
 decoded_output = processor.decode(
@@ -677,7 +683,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 def test_llama_small_model_integration_forward(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
@@ -685,7 +691,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 prompt = (
 "<|im_start|>user\n<IMG_CONTEXT>\nPlease describe the image explicitly.<|im_end|>\n<|im_start|>assistant\n"
 )
-inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
 
 # Forward
 with torch.inference_mode():
@@ -695,12 +701,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 
 expected_logits_all = Expectations(
 {
-("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-("cuda", 7): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.bfloat16),
-("cuda", 8): torch.tensor([-9.8750, -0.5117, 1.4297, -10.3750, -10.3750], dtype=torch.bfloat16),
+("xpu", 3): torch.tensor([-9.8750, -0.5703, 1.4297, -10.3125, -10.3125], dtype=torch.float16),
+("cuda", 7): torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16),
+("cuda", 8): torch.tensor([-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], dtype=torch.float16),
 }
 ) # fmt: skip
-expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.bfloat16)
+expected_logits = torch.tensor(expected_logits_all.get_expectation(), dtype=torch.float16)
 
 # The original implementation and the transformers implementation do not match exactly, hence the higher tolerance.
 # The difference is likely due to the different implementations of the attention mechanism (different order of operations)
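
Note on the tolerance comment in the hunk above: the check amounts to an elementwise comparison of a few logits against per-device reference values with a loosened tolerance. The lines below are only an illustrative sketch of such a check, assuming torch.testing.assert_close with made-up atol/rtol values; they are not the test's actual assertion, and the numbers are simply copied from the ("cuda", 7) and ("cuda", 8) rows above.

import torch

# Illustrative tolerance check only; the atol/rtol values here are assumptions.
actual_logits = torch.tensor([-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], dtype=torch.float16)
expected_logits = torch.tensor([-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], dtype=torch.float16)

# Raises AssertionError if any element differs by more than atol + rtol * |expected|.
torch.testing.assert_close(actual_logits, expected_logits, atol=3e-2, rtol=1e-2)
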
@@ -716,22 +722,30 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 def test_llama_small_model_integration_generate_text_only(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 prompt = "<|im_start|>user\nWrite a haiku<|im_end|>\n<|im_start|>assistant\n"
-inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.bfloat16)
+inputs = processor(text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
 with torch.no_grad():
 generate_ids = model.generate(**inputs, max_new_tokens=200, do_sample=False)
 decoded_output = processor.decode(
 generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
 )
-expected_output = "Autumn leaves fall,\nNature's breath, a season's sigh,\nSilent woods awake."
+expected_outputs = Expectations(
+{
+("cuda", 7): "Autumn leaves fall,\nNature's breath, a gentle sigh,\nSilent whispers.",
+("cuda", 8): "Autumn leaves fall,\nNature's breath, a silent sigh,\nWinter's chill approaches.",
+}
+)
+expected_output = expected_outputs.get_expectation()
 
 self.assertEqual(decoded_output, expected_output)
 
 def test_llama_small_model_integration_generate_chat_template(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 messages = [
 {
@@ -745,7 +759,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 
 inputs = processor.apply_chat_template(
 messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
-).to(torch_device, dtype=torch.bfloat16)
+).to(torch_device, dtype=torch.float16)
 with torch.no_grad():
 generate_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
 decoded_output = processor.decode(
@@ -757,7 +771,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 def test_llama_small_model_integration_batched_generate(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 # Prepare inputs
 prompt = [
@@ -768,7 +782,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
 
 inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
-torch_device, dtype=torch.bfloat16
+torch_device, dtype=torch.float16
 )
 
 output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -778,11 +792,12 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace.",
-("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nA wooden path leads to the sea,\nPeaceful, still waters.",
+("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
 
 self.assertEqual(
 decoded_output,
 expected_output,
@@ -791,7 +806,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 
 # Check second output
 decoded_output = processor.decode(output[1], skip_special_tokens=True)
-expected_output = 'user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters' # fmt: skip
+expected_output = "user\n\nDescribe this image\nassistant\nThe image shows a street scene with a traditional Chinese gate in the background, adorned with red and gold colors and Chinese characters"
 self.assertEqual(
 decoded_output,
 expected_output,
@@ -801,7 +816,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 def test_llama_small_model_integration_batched_generate_multi_image(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16
+self.small_model_checkpoint, device_map=torch_device, torch_dtype=torch.float16
 )
 # Prepare inputs
 prompt = [
@@ -825,7 +840,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 )
 
 inputs = processor(text=prompt, images=[[image1], [image2, image3]], padding=True, return_tensors="pt").to(
-torch_device, dtype=torch.bfloat16
+torch_device, dtype=torch.float16
 )
 
 output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
@@ -833,7 +848,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 # Check first output
 decoded_output = processor.decode(output[0], skip_special_tokens=True)
 # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232
-expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden path leads to calm lake,\nNature's peaceful grace." # fmt: skip
+expected_output = "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors."
 
 self.assertEqual(
 decoded_output,
 expected_output,
@@ -842,7 +858,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 
 # Check second output
 decoded_output = processor.decode(output[1], skip_special_tokens=True)
-expected_output = 'user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences' # fmt: skip
+expected_output = "user\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences"
 self.assertEqual(
 decoded_output,
 expected_output,
@@ -893,7 +909,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 def test_llama_small_model_integration_interleaved_images_videos(self):
 processor = AutoProcessor.from_pretrained(self.small_model_checkpoint)
 model = InternVLForConditionalGeneration.from_pretrained(
-self.small_model_checkpoint, torch_dtype=torch.bfloat16, device_map=torch_device
+self.small_model_checkpoint, torch_dtype=torch.float16, device_map=torch_device
 )
 messages = [
 [
@@ -945,7 +961,7 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 return_tensors="pt",
 padding=True,
 num_frames=8,
-).to(torch_device, dtype=torch.bfloat16)
+).to(torch_device, dtype=torch.float16)
 
 output = model.generate(**inputs, do_sample=False, max_new_tokens=25)
 
@@ -954,8 +970,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-("cuda", 7): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that they are actually",
-("cuda", 8): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After closely examining the images again, I can see that there are several differences",
+("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **',
+("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
@@ -970,8 +986,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-("cuda", 7): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
-("cuda", 8): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common shot in tennis where the player swings the racket across their",
+("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
+("cuda", 8): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot. This is a common stroke in tennis where the player swings the racket across their',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
@@ -986,8 +1002,8 @@ class InternVLLlamaIntegrationTest(unittest.TestCase):
 expected_outputs = Expectations(
 {
 ("xpu", 3): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-("cuda", 7): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
-("cuda", 8): "user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.",
+("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
+("cuda", 8): 'user\n\nWrite a haiku for this image\nassistant\nMajestic snow-capped peaks,\nWooden dock stretches to the sea,\nSilent water mirrors.',
 }
 ) # fmt: skip
 expected_output = expected_outputs.get_expectation()
@@ -248,7 +248,7 @@ class Qwen3IntegrationTest(unittest.TestCase):
 tokenizer = AutoTokenizer.from_pretrained(qwen_model, pad_token="</s>", padding_side="right")
 if is_torch_greater_or_equal("2.7.0"):
 strict = False # Due to https://github.com/pytorch/pytorch/issues/150994
-EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unsalted, unsweetened, and unflavored."]
+EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated."]
 else:
 strict = True
 EXPECTED_TEXT_COMPLETION = ["My favourite condiment is 100% plain, unflavoured, and unadulterated. It is"]
@@ -422,11 +422,13 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
 output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
 output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
-EXPECTED_OUTPUT_STR = (
-"Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place"
-" looks like a"
-)
-self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
+EXPECTED_OUTPUT_STRS = [
+# torch 2.6
+"Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place looks like a",
+# torch 2.7
+"Today is a nice day and the sun is shining. A nice day with warm rainy and windy weather today.",
+]
+self.assertIn(output_str, EXPECTED_OUTPUT_STRS)
 
 @require_torch_accelerator
 @require_torch_fp16