transformers/tests/models/mllama/test_modeling_mllama.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Mllama model."""

import gc
import unittest

import requests

from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    MllamaConfig,
    MllamaForCausalLM,
    MllamaForConditionalGeneration,
    is_torch_available,
    is_vision_available,
)
from transformers.models.mllama.configuration_mllama import MllamaTextConfig
from transformers.testing_utils import (
    is_flaky,
    require_bitsandbytes,
    require_read_token,
    require_torch,
    require_torch_gpu,
    require_torch_sdpa,
    slow,
    torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor


if is_torch_available():
    import torch

if is_vision_available():
    from PIL import Image


class MllamaText2TextModelTester:
    def __init__(
        self,
        parent,
        ignore_index=-100,
        seq_length=7,
        is_training=True,
        text_config={
            "model_type": "mllama",
            "vocab_size": 99,
            "hidden_size": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "intermediate_size": 37,
            "hidden_act": "gelu",
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "rope_scaling": {"rope_type": "default"},
            "pad_token_id": 0,
            "bos_token_id": 1,
            "eos_token_id": 2,
        },
    ):
        self.parent = parent
        self.ignore_index = ignore_index
        self.text_config = text_config
        self.seq_length = seq_length
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.is_training = is_training
        self.pad_token_id = self.text_config["pad_token_id"]
        self.batch_size = 3

    def get_config(self):
        return MllamaTextConfig(**self.text_config)

    def prepare_config_and_inputs(self):
        config = self.get_config()
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.vocab_size - 1) + 1
        attention_mask = input_ids.ne(1).to(torch_device)
        return config, input_ids, attention_mask

    def prepare_config_and_inputs_for_common(self):
        config, input_ids, attention_mask = self.prepare_config_and_inputs()
        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
        return config, inputs_dict

    def create_and_check_mllama_model_fp16_forward(self, config, input_ids, attention_mask):
        model = MllamaForCausalLM(config=config)
        model.to(torch_device)
        model.eval()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=True,
            )["logits"]
        self.parent.assertFalse(torch.isnan(logits).any().item())
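

# Illustrative usage sketch (not part of the test suite): how the tester above is typically
# consumed to run a forward pass on a tiny, randomly initialized text-only model. The
# `_demo_text_only_forward` name is hypothetical and the function is never called by the tests.
def _demo_text_only_forward():
    tester = MllamaText2TextModelTester(parent=None)
    config, input_ids, attention_mask = tester.prepare_config_and_inputs()
    model = MllamaForCausalLM(config=config).to(torch_device).eval()
    with torch.no_grad():
        # Logits come back with shape (batch_size, seq_length, vocab_size).
        logits = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True).logits
    return logits.shape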


@require_torch
class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """
    Model tester for `MllamaForCausalLM`.
    """

    all_model_classes = (MllamaForCausalLM,) if is_torch_available() else ()
    all_generative_model_classes = (MllamaForCausalLM,) if is_torch_available() else ()
    test_pruning = False
    test_head_masking = False
    _torch_compile_test_ckpt = "nltpt/Llama-3.2-11B-Vision"

    def setUp(self):
        self.model_tester = MllamaText2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True)

    @require_torch_sdpa
    @slow
    @is_flaky()
    def test_eager_matches_sdpa_generate(self):
        super().test_eager_matches_sdpa_generate()

    @unittest.skip(reason="The outputs don't match, no idea why")
    def test_beam_search_low_memory(self):
        pass
@unittest.skip(reason="Quanto test is borken")
def test_generate_with_quant_cache(self):
pass


class MllamaVisionText2TextModelTester:
    def __init__(
        self,
        parent,
        ignore_index=-100,
        image_token_index=4,
        seq_length=7,
        is_training=True,
        text_config={
            "model_type": "mllama",
            "vocab_size": 99,
            "hidden_size": 32,
            "num_hidden_layers": 4,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "intermediate_size": 37,
            "hidden_act": "gelu",
            "max_position_embeddings": 512,
            "initializer_range": 0.02,
            "rope_scaling": {"rope_type": "default"},
            "pad_token_id": 0,
            "bos_token_id": 1,
            "eos_token_id": 2,
            "cross_attention_layers": [1],
        },
        vision_config={
            "image_size": 30,
            "patch_size": 2,
            "num_channels": 3,
            "hidden_size": 16,
            "intermediate_layers_indices": [0],
            "vision_output_dim": 32,
            "projection_dim": 32,
            "num_hidden_layers": 6,
            "num_global_layers": 2,
            "num_attention_heads": 4,
            "intermediate_size": 37,
            "dropout": 0.1,
            "initializer_range": 0.02,
            "supported_aspect_ratios": [[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [3, 1], [4, 1]],
        },
    ):
        self.parent = parent
        self.is_training = is_training
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.text_config = text_config
        self.vision_config = vision_config
        self.seq_length = seq_length
        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.pad_token_id = self.text_config["pad_token_id"]
        self.batch_size = 3
        self.num_channels = 3
        self.image_size = 224
        self.max_num_images = 1
        self.max_image_tiles = 4

    def get_config(self):
        return MllamaConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            image_token_index=self.image_token_index,
        )

    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.max_num_images,
                self.max_image_tiles,
                self.vision_config["num_channels"],
                self.vision_config["image_size"],
                self.vision_config["image_size"],
            ]
        )
        aspect_ratio_ids = torch.tensor([[6] * self.batch_size], device=torch_device).transpose(0, 1)
        aspect_ratio_mask = torch.ones(self.batch_size, self.max_num_images, self.max_image_tiles)
        config = self.get_config()
        return config, pixel_values, aspect_ratio_ids, aspect_ratio_mask

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, aspect_ratio_ids, aspect_ratio_mask = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
        attention_mask = input_ids.ne(1).to(torch_device)
        aspect_ratio_mask = aspect_ratio_mask.to(torch_device)
        cross_attention_mask = torch.ones(
            (self.batch_size, self.seq_length, self.max_num_images, self.max_image_tiles), device=torch_device
        )
        input_ids[input_ids == config.image_token_index] = self.pad_token_id
        input_ids[:, 1] = config.image_token_index
        inputs_dict = {
            "pixel_values": pixel_values,
            "aspect_ratio_ids": aspect_ratio_ids,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "aspect_ratio_mask": aspect_ratio_mask,
            "cross_attention_mask": cross_attention_mask,
            "use_cache": True,
        }
        return config, inputs_dict

    def create_and_check_mllama_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask):
        model = MllamaForConditionalGeneration(config=config)
        model.to(torch_device)
        model.eval()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values.to(torch.bfloat16),
                return_dict=True,
            )["logits"]
        self.parent.assertFalse(torch.isnan(logits).any().item())
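

# Illustrative usage sketch (not part of the test suite): how the multimodal tester above is
# typically consumed to run a forward pass with tiny random weights. The
# `_demo_vision_text_forward` name is hypothetical and the function is never called by the tests.
def _demo_vision_text_forward():
    tester = MllamaVisionText2TextModelTester(parent=None)
    config, inputs_dict = tester.prepare_config_and_inputs_for_common()
    model = MllamaForConditionalGeneration(config=config).to(torch_device).eval()
    with torch.no_grad():
        # `inputs_dict` already carries the pixel_values, aspect_ratio_ids/mask and
        # cross_attention_mask expected by the model's forward signature.
        outputs = model(**inputs_dict, return_dict=True)
    return outputs.logits.shape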


@require_torch
class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """
    Model tester for `MllamaForConditionalGeneration`.
    """

    all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
    all_generative_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
    test_pruning = False
    test_head_masking = False
    test_torchscript = False

    def setUp(self):
        self.model_tester = MllamaVisionText2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)

            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]

            wte = model.get_input_embeddings()
            inputs["inputs_embeds"] = wte(input_ids)

            with torch.no_grad():
                model(**inputs)

    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
    # while some other models require pixel_values to be present
    def test_inputs_embeds_matches_input_ids(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]

            inputs_embeds = model.get_input_embeddings()(input_ids)

            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            self.assertTrue(torch.allclose(out_embeds, out_ids))

    @require_torch_sdpa
    @slow
    @is_flaky()
    def test_eager_matches_sdpa_generate(self):
        super().test_eager_matches_sdpa_generate()

    @require_torch_sdpa
    @slow
    @is_flaky()
    def test_eager_matches_sdpa_inference_1_bfloat16(self):
        # A workaround to override parametrized test with flaky decorator
        super().test_eager_matches_sdpa_inference_1_bfloat16()

    @unittest.skip(reason="Static cache not supported")
    def test_static_cache_matches_dynamic(self):
        # TypeError: list indices must be integers or slices, not tuple
        # TODO: @raushan, please look into this for new cache format
        pass

    @unittest.skip(reason="Mllama has dynamic control flow which is not yet supported by compile")
    def test_generate_compile_fullgraph(self):
        pass

    @unittest.skip(reason="The outputs don't match, no idea why")
    def test_beam_search_low_memory(self):
        pass

    @unittest.skip(reason="Mllama is not yet supported by compile")
    def test_sdpa_can_compile_dynamic(self):
        # TODO: look into this, AttributeError("'tensor' object has no attribute '__pow__'")
        # relevant issue: https://github.com/pytorch/pytorch/issues/133166
        pass

    @unittest.skip(reason="The test itself is broken")  # TODO @zucchini-nlp
    def test_generate_with_quant_cache(self):
        pass

    @unittest.skip(reason="AssertionError: Items in the second set but not the first: might be a setting issue")
    def test_model_parallelism(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_compile_cuda_graph_time(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_torch_compile_fullgraph(self):
        pass

    @unittest.skip(reason="Device side assert triggered")
    def test_assisted_decoding_with_num_logits_to_keep(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_beam_sample_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_beam_search_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_constrained_beam_search_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_dola_decoding_sample(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_generate_methods_with_num_logits_to_keep(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_greedy_generate_dict_outputs(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_group_beam_search_generate_dict_output(self):
        pass

    @unittest.skip(reason="Failing test, need to fix")
    def test_model_parallel_beam_search(self):
        pass

    @is_flaky()  # TODO (joao, raushan) - investigate why this test is flaky (probably depends on the model initialization)
    def test_new_cache_format_0(self):
        super().test_new_cache_format_0()

    @is_flaky()  # TODO (joao, raushan) - investigate why this test is flaky (probably depends on the model initialization)
    def test_new_cache_format_1(self):
        super().test_new_cache_format_1()

    @is_flaky()  # TODO (joao, raushan) - investigate why this test is flaky (probably depends on the model initialization)
    def test_new_cache_format_2(self):
        super().test_new_cache_format_2()

    @unittest.skip(reason="Failing test, need to fix")
    def test_sample_generate_dict_output(self):
        pass

    def test_generate_text_only_with_cache(self):
        """
        Tests that our cached generation with text-only inputs works. When mllama was introduced, this feature
        required cache modifications (because layers are skipped in practice). This test should prevent regressions.
        """
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)

            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]

            model.generate(input_ids, use_cache=True)


@require_torch
class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase):
    def setUp(self):
        self.base_model_checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        self.instruct_model_checkpoint = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    def tearDown(self):
        gc.collect()
        torch.cuda.empty_cache()

    @slow
    @require_torch_gpu
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_generate(self):
        # Prepare inputs
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)

        prompt = "<|image|>If I had to write a haiku for this one"
        url = "https://llava-vl.github.io/static/images/view.jpg"
        image = Image.open(requests.get(url, stream=True).raw)

        inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch_device)

        # Check inputs ids
        expected_input_ids = torch.tensor([[128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]], device=torch_device)  # fmt: skip
        self.assertTrue(torch.equal(inputs["input_ids"], expected_input_ids))

        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )

        # Generate
        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a"  # fmt: skip

        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

    @slow
    @require_torch_gpu
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_generate_text_only(self):
        # Prepare inputs
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)
        prompt = "If I had to write a haiku"
        inputs = processor(text=prompt, return_tensors="pt").to(torch_device)

        # Check inputs ids
        expected_input_ids = [128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342]
        self.assertEqual(inputs["input_ids"].cpu().squeeze().tolist(), expected_input_ids)

        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )

        # Generate
        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_output = "If I had to write a haiku about my life, I think it would be something like:\n\"Life is a messy stream\nTwists and turns, ups"  # fmt: skip

        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

    @slow
    @require_torch_gpu
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_forward(self):
        # Prepare inputs
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)

        prompt = "<|image|>If I had to write a haiku for this one"
        url = "https://llava-vl.github.io/static/images/view.jpg"
        image = Image.open(requests.get(url, stream=True).raw)

        inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch_device)

        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )

        # Forward
        with torch.inference_mode():
            output = model(**inputs)

        actual_logits = output.logits[0, -1, :5].cpu()
        expected_logits = torch.tensor([8.3594, 7.7148, 4.7266, 0.7803, 3.1504])
        self.assertTrue(
            torch.allclose(actual_logits, expected_logits, atol=0.1),
            f"Actual logits: {actual_logits}"
            f"\nExpected logits: {expected_logits}"
            f"\nDifference: {torch.abs(actual_logits - expected_logits)}",
        )

    @slow
    @require_torch_gpu
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_batched_generate(self):
        processor = AutoProcessor.from_pretrained(self.base_model_checkpoint)

        # Prepare inputs
        prompt = [
            "<|image|>If I had to write a haiku for this one",
            "<|image|>This image shows",
        ]
        image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="pt").to(
            torch_device
        )

        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.base_model_checkpoint, quantization_config=quantization_config
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

        # Check first output
        decoded_output = processor.decode(output[0], skip_special_tokens=True)
        expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a"  # fmt: skip

        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

        # Check second output
        decoded_output = processor.decode(output[1], skip_special_tokens=True)
        expected_output = "This image shows is a photograph of a stop sign in front of a Chinese archway. The stop sign is red with white letters and is"  # fmt: skip

        self.assertEqual(
            decoded_output,
            expected_output,
            f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
        )

    @slow
    @require_torch_gpu
    @require_bitsandbytes
    @require_read_token
    def test_11b_model_integration_multi_image_generate(self):
        processor = AutoProcessor.from_pretrained(self.instruct_model_checkpoint)

        # Prepare inputs
        image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
        image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Whats shown in this image?"},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": "This image shows a long wooden dock extending out into a lake."}
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What about this one, what do you see here? Can you describe in detail?"},
                ],
            },
        ]

        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[[image1, image2]], return_tensors="pt").to(torch_device)
        prompt_len = inputs["input_ids"].shape[-1]

        # Load model in 4 bit
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        model = MllamaForConditionalGeneration.from_pretrained(
            self.instruct_model_checkpoint, quantization_config=quantization_config
        )

        output = model.generate(**inputs, do_sample=False, max_new_tokens=25)

        # Check first output
        generated_output = output[0][prompt_len:]
        decoded_output = processor.decode(generated_output, skip_special_tokens=False)
        # The model should talk about the "stop sign", but it talks about the "dock" instead;
        # this happens only in the quantized version, bfloat16 works fine.
expected_output = "This image shows a long wooden dock extending out into a lake. The dock is made of wooden planks and has a railing"
self.assertEqual(
decoded_output,
expected_output,
f"Decoded output: {decoded_output}\nExpected output: {expected_output}",
)