# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Llava-Onevision model."""

import unittest

import numpy as np
import requests
from huggingface_hub import hf_hub_download
from parameterized import parameterized

from transformers import (
    AutoProcessor,
    LlavaOnevisionConfig,
    LlavaOnevisionForConditionalGeneration,
    LlavaOnevisionModel,
    is_torch_available,
    is_vision_available,
)
from transformers.testing_utils import (
    cleanup,
    require_bitsandbytes,
    require_torch,
    slow,
    torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
    ModelTesterMixin,
    _config_zero_init,
    floats_tensor,
    ids_tensor,
)


if is_torch_available():
    import torch

if is_vision_available():
    from PIL import Image


class LlavaOnevisionVisionText2TextModelTester:
    def __init__(
        self,
        parent,
        ignore_index=-100,
        image_token_index=1,
        projector_hidden_act="gelu",
        seq_length=7,
        vision_feature_select_strategy="full",
        vision_feature_layer=-1,
        text_config={
            "model_type": "qwen2",
            "seq_length": 7,
            "is_training": True,
            "use_input_mask": True,
            "use_token_type_ids": False,
            "use_labels": True,
            "vocab_size": 99,
            "hidden_size": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "intermediate_size": 37,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "attention_probs_dropout_prob": 0.1,
            "max_position_embeddings": 580,
            "type_vocab_size": 16,
            "type_sequence_label_size": 2,
            "initializer_range": 0.02,
            "num_labels": 3,
            "num_choices": 4,
            "pad_token_id": 0,
        },
        is_training=True,
        vision_config={
            "image_size": 16,
            "patch_size": 8,
            "num_channels": 3,
            "is_training": True,
            "hidden_size": 32,
            "projection_dim": 32,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "intermediate_size": 37,
            "dropout": 0.1,
            "attention_dropout": 0.1,
            "initializer_range": 0.02,
        },
    ):
        self.parent = parent
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
        self.text_config = text_config
        self.vision_config = vision_config
        self.pad_token_id = text_config["pad_token_id"]

        self.num_image_tokens = 10
        self.seq_length = seq_length + self.num_image_tokens

        self.num_hidden_layers = text_config["num_hidden_layers"]
        self.vocab_size = text_config["vocab_size"]
        self.hidden_size = text_config["hidden_size"]
        self.num_attention_heads = text_config["num_attention_heads"]
        self.is_training = is_training

        self.batch_size = 3
        self.num_channels = 3
        self.image_size = 30
        self.image_grid_pinpoints = [[16, 16]]

    def get_config(self):
        return LlavaOnevisionConfig(
            text_config=self.text_config,
            vision_config=self.vision_config,
            ignore_index=self.ignore_index,
            image_token_index=self.image_token_index,
            projector_hidden_act=self.projector_hidden_act,
            vision_feature_select_strategy=self.vision_feature_select_strategy,
            vision_feature_layer=self.vision_feature_layer,
            image_grid_pinpoints=self.image_grid_pinpoints,
        )

    def prepare_config_and_inputs(self):
        # pixel values are packed as (batch_size, num_patches, num_channels, height, width)
        pixel_values = floats_tensor(
            [
                self.batch_size,
                3,
                self.vision_config["num_channels"],
                self.vision_config["image_size"],
                self.vision_config["image_size"],
            ]
        )
        config = self.get_config()

        return config, pixel_values

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values = config_and_inputs
        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)

        # make sure no stray token collides with the image token, then place image tokens at the start
        input_ids[input_ids == config.image_token_index] = self.pad_token_id
        input_ids[:, : self.num_image_tokens] = config.image_token_index

        labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
        labels[:, : self.num_image_tokens] = self.ignore_index

        inputs_dict = {
            "pixel_values": pixel_values,
            "image_sizes": torch.tensor([[45, 45]] * self.batch_size),
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
        return config, inputs_dict


@require_torch
class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
    """
    Model tester for `LlavaOnevisionForConditionalGeneration`.
    """

    all_model_classes = (
        (
            LlavaOnevisionModel,
            LlavaOnevisionForConditionalGeneration,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = (
        {"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {}
    )
    test_pruning = False
    test_head_masking = False
    _is_composite = True

    def setUp(self):
        self.model_tester = LlavaOnevisionVisionText2TextModelTester(self)
        common_properties = ["image_token_index", "video_token_index", "vision_feature_layer"]
        self.config_tester = ConfigTester(
            self, config_class=LlavaOnevisionConfig, has_text_modality=False, common_properties=common_properties
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_initialization(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                # LLaVA-OneVision has a SigLIP backbone, which initializes weights differently from CLIP
                if "image_newline" in name or "vision_tower" in name:
                    continue
                elif param.requires_grad:
                    self.assertIn(
                        ((param.data.mean() * 1e9).round() / 1e9).item(),
                        [0.0, 1.0],
                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                    )

    # overwrite inputs_embeds test because we need to delete "pixel_values" for LVLMs
    def test_inputs_embeds(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)

            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]

            wte = model.get_input_embeddings()
            inputs["inputs_embeds"] = wte(input_ids)

            with torch.no_grad():
                model(**inputs)

    # overwrite inputs_embeds test because we need to delete "pixel_values" for LVLMs,
    # while some other models require pixel_values to be present
    def test_inputs_embeds_matches_input_ids(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            inputs = self._prepare_for_class(inputs_dict, model_class)
            input_ids = inputs["input_ids"]
            del inputs["input_ids"]
            del inputs["pixel_values"]

            inputs_embeds = model.get_input_embeddings()(input_ids)

            with torch.no_grad():
                out_ids = model(input_ids=input_ids, **inputs)[0]
                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
            torch.testing.assert_close(out_embeds, out_ids)

    def test_odd_sized_image(self):
        # prepare model configuration
        config = self.model_tester.get_config()

        # prepare input: pixel values are (batch_size, num_patches, num_channels, height, width)
        num_image_tokens = 10
        pixel_values = floats_tensor([1, 2, 3, config.vision_config.image_size, config.vision_config.image_size])
        input_ids = ids_tensor([1, 64], config.text_config.vocab_size - 2) + 2
        input_ids[:, :num_image_tokens] = config.image_token_index
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)

        inputs_dict = {
            "pixel_values": pixel_values,
            "image_sizes": torch.tensor([[13, 16]]),  # odd-sized image
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

        # forward with odd-sized image input
        for model_class in self.all_model_classes:
            model = model_class(config).to(torch_device)
            model(**inputs_dict)

    @parameterized.expand(
        [
            (-1,),
            ([-1],),
            ([-1, -2],),
        ],
    )
    def test_vision_feature_layers(self, vision_feature_layer):
        """
        Test that we can use either a single vision feature layer or a list of vision feature layers.
        """
        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.vision_feature_layer = vision_feature_layer

        num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer)
        hidden_size = config.vision_config.hidden_size
        expected_features = hidden_size * num_feature_layers

        for model_class in self.all_model_classes:
            model = model_class(config).to(torch_device)
            # We should have the right number of input features,
            # and should be able to run a forward pass without exploding
            base_model = getattr(model, "model", model)
            assert base_model.multi_modal_projector.linear_1.in_features == expected_features
            model(**input_dict)

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC; SiglipVisionModel does not support standalone training"
    )
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC; SiglipVisionModel does not support standalone training"
    )
    def test_training_gradient_checkpointing_use_reentrant(self):
        pass

    @unittest.skip(
        reason="This architecture seems to not compute gradients properly when using GC; SiglipVisionModel does not support standalone training"
    )
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

    @unittest.skip(
        "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs."
        " Can be tested as part of LLM test"
    )
    def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
        pass


@require_torch
class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase):
    def setUp(self):
        self.processor = AutoProcessor.from_pretrained(
            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", padding_side="left"
        )
        image_file = hf_hub_download(
            repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset"
        )
        video_file = hf_hub_download(
            repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
        )
        self.image = Image.open(image_file)
        self.video = np.load(video_file)
        # "<image>" is the placeholder the processor expands into image tokens
        self.prompt_image = "user\n<image>\nWhat do you see in this image?<|im_end|>\n<|im_start|>assistant\n"
        self.prompt_video = "user\n