Expand inputs in processors for VLMs (#30962)

* let it be

* draft

* should not have changed

* add warnings

* fix & add tests

* fix tests

* inputs embeds cannot be passed with pixels

* more updates

* paligemma ready!

* minor typos

* update blip-2

* fix tests & raise error

* docstring

* add blip2 test

* tmp

* add image seq length to config

* update docstring

* delete

* fix tests

* fix blip

* fix paligemma

* out-of-place scatter

* add llava-next-video

* Update src/transformers/models/blip_2/modeling_blip_2.py

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>

* remove tmp

* codestyle

* nits

* more nits

* remove overriding in tests

* comprehension when merging video

* fix-copies

* revert changes for embeds test

* fix tests after making comprehension

* Update src/transformers/models/blip_2/processing_blip_2.py

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>

* Update src/transformers/models/blip_2/processing_blip_2.py

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>

* more updates

* fix tests

---------

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
Author: Raushan Turganbay, 2024-08-13 10:14:39 +05:00 (committed by GitHub)
Parent: 2a5a6ad18a
Commit: a29eabd0eb
37 changed files with 1951 additions and 802 deletions
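As a rough usage sketch of the behaviour this PR enables (editor-added, assuming a BLIP-2 checkpoint whose processor config already carries `num_query_tokens` and whose model config carries `image_token_index`, e.g. an updated `Salesforce/blip2-opt-2.7b`), the processor now inserts the `<image>` placeholders itself and the model scatters the Q-Former outputs into those positions:

```python
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

image = Image.new("RGB", (224, 224))  # stand-in for a real image
inputs = processor(images=image, text="Question: what is shown? Answer:", return_tensors="pt")

# input_ids now start with num_query_tokens "<image>" placeholder ids;
# checkpoints without the updated config fall back to the legacy concat path with a warning
generated_ids = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```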


@@ -264,6 +264,8 @@ class Blip2Config(PretrainedConfig):
num_query_tokens (`int`, *optional*, defaults to 32):
The number of query tokens passed through the Transformer.
image_token_index (`int`, *optional*):
Token index of special image token.
kwargs (*optional*):
Dictionary of keyword arguments.
@@ -299,7 +301,15 @@ class Blip2Config(PretrainedConfig):
model_type = "blip-2"
def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
def __init__(
self,
vision_config=None,
qformer_config=None,
text_config=None,
num_query_tokens=32,
image_token_index=None,
**kwargs,
):
super().__init__(**kwargs)
if vision_config is None:
@@ -323,6 +333,7 @@ class Blip2Config(PretrainedConfig):
self.is_encoder_decoder = self.text_config.is_encoder_decoder
self.num_query_tokens = num_query_tokens
self.image_token_index = image_token_index
self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
self.initializer_factor = 1.0
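A short sketch (editor-added; the token id depends on the tokenizer) of how the new `image_token_index` field would typically be populated when migrating a checkpoint:

```python
from transformers import AutoTokenizer, Blip2Config

tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b")
tokenizer.add_tokens(["<image>"], special_tokens=True)  # same token the updated processor registers

config = Blip2Config.from_pretrained("Salesforce/blip2-opt-2.7b")
config.image_token_index = tokenizer.convert_tokens_to_ids("<image>")
```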


@@ -1767,12 +1767,25 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
)
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
expected_device = language_model_attention_mask.device
attention_mask = torch.cat([language_model_attention_mask, attention_mask.to(expected_device)], dim=1)
# if the model already has "image_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "image_token_index", None) is not None:
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
else:
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
)
if self.config.use_decoder_only_language_model:
outputs = self.language_model(
@@ -1876,20 +1889,34 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel):
.repeat(batch_size, 1)
.to(image_embeds.device)
)
inputs_embeds = self.get_input_embeddings()(input_ids)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)
# concatenate query embeddings with prompt embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
# if the model already has "image_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "image_token_index", None) is not None:
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
)
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
# TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
# TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = (
generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
outputs = self.language_model.generate(
inputs_embeds=inputs_embeds,
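A toy sketch of the two merge paths in the hunk above, editor-added with random tensors standing in for real activations (`50` plays the role of `image_token_index`):

```python
import torch

hidden_size = 8
input_ids = torch.tensor([[50, 50, 50, 2, 11, 12]])      # three "<image>" placeholders + text
inputs_embeds = torch.randn(1, 6, hidden_size)           # token embeddings
language_model_inputs = torch.randn(1, 3, hidden_size)   # projected Q-Former queries

# new path: scatter the query embeddings into the placeholder positions (out of place)
special_image_mask = (input_ids == 50).unsqueeze(-1).expand_as(inputs_embeds)
merged = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

# legacy path: prepend the query embeddings, which grows the sequence
legacy = torch.cat([language_model_inputs, inputs_embeds], dim=1)
print(merged.shape, legacy.shape)  # torch.Size([1, 6, 8]) torch.Size([1, 9, 8])
```

The legacy branch is also why `max_length`/`min_length` get bumped by the number of query embeddings: with `num_query_tokens=32` and the default `max_length=20`, generation effectively runs with `20 + 32 - 1 = 51` so the limit still counts only text tokens.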


@@ -20,8 +20,18 @@ from typing import List, Optional, Union
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...tokenization_utils_base import (
AddedToken,
BatchEncoding,
PaddingStrategy,
PreTokenizedInput,
TextInput,
TruncationStrategy,
)
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class Blip2Processor(ProcessorMixin):
@@ -36,20 +46,24 @@ class Blip2Processor(ProcessorMixin):
An instance of [`BlipImageProcessor`]. The image processor is a required input.
tokenizer (`AutoTokenizer`):
An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
num_query_tokens (`int`, *optional*):
Number of tokens used by the Qformer as queries; should be the same as in the model's config.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = []
valid_kwargs = ["num_query_tokens"]
image_processor_class = "BlipImageProcessor"
tokenizer_class = "AutoTokenizer"
# Copied from transformers.models.blip.processing_blip.BlipProcessor.__init__
def __init__(self, image_processor, tokenizer, **kwargs):
def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor
self.current_processor = image_processor
self.image_token = AddedToken("<image>", normalized=False, special=True)
tokenizer.add_tokens([self.image_token], special_tokens=True)
self.num_query_tokens = num_query_tokens
super().__init__(image_processor, tokenizer)
# Copied from transformers.models.blip.processing_blip.BlipProcessor.__call__
def __call__(
self,
images: ImageInput = None,
@@ -106,7 +120,13 @@ class Blip2Processor(ProcessorMixin):
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
if text is not None:
text_encoding = self.tokenizer(
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
text_encoding = {}
_text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
@@ -121,9 +141,30 @@ class Blip2Processor(ProcessorMixin):
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
return_tensors=None, # hardcode "None" here for prepending image tokens
**kwargs,
)
# if we know how many query tokens, expand text inside processor. We need this hacky manipulation
# because BLIP expects image tokens to be at the beginning even before BOS token
if self.num_query_tokens is not None:
image_tokens = self.image_token.content * self.num_query_tokens
image_token_encoding = self.tokenizer([image_tokens], add_special_tokens=False, return_tensors=None)
for k in _text_encoding:
text_encoding[k] = [
img_encoding + txt_encoding
for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
]
else:
text_encoding = _text_encoding
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cast to desired return tensors type
text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
else:
text_encoding = None
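The prepend step above, replayed on made-up token ids (a toy, editor-added illustration of the comprehension):

```python
num_query_tokens = 3
image_token_id = 32100  # illustrative id for "<image>"

_text_encoding = {"input_ids": [[2, 7, 9]], "attention_mask": [[1, 1, 1]]}
image_token_encoding = {
    "input_ids": [[image_token_id] * num_query_tokens],
    "attention_mask": [[1] * num_query_tokens],
}

text_encoding = {
    key: [img + txt for img, txt in zip(image_token_encoding[key], _text_encoding[key])]
    for key in _text_encoding
}
# {'input_ids': [[32100, 32100, 32100, 2, 7, 9]], 'attention_mask': [[1, 1, 1, 1, 1, 1]]}
```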


@@ -269,6 +269,8 @@ class InstructBlipConfig(PretrainedConfig):
num_query_tokens (`int`, *optional*, defaults to 32):
The number of query tokens passed through the Transformer.
image_token_index (`int`, *optional*):
Token index of special image token.
kwargs (*optional*):
Dictionary of keyword arguments.
@@ -304,7 +306,15 @@ class InstructBlipConfig(PretrainedConfig):
model_type = "instructblip"
def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
def __init__(
self,
vision_config=None,
qformer_config=None,
text_config=None,
num_query_tokens=32,
image_token_index=None,
**kwargs,
):
super().__init__(**kwargs)
if vision_config is None:
@@ -328,6 +338,7 @@ class InstructBlipConfig(PretrainedConfig):
self.is_encoder_decoder = self.text_config.is_encoder_decoder
self.num_query_tokens = num_query_tokens
self.image_token_index = image_token_index
self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
self.initializer_factor = 1.0


@@ -1453,12 +1453,24 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel):
)
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_model_attention_mask.to(attention_mask.device), attention_mask], dim=1)
# if the model already has "image_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "image_token_index", None) is not None:
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
)
if self.config.use_decoder_only_language_model:
outputs = self.language_model(
@@ -1580,17 +1592,32 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel):
)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)
# concatenate query embeddings with prompt embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
# if the model already has "image_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "image_token_index", None) is not None:
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
)
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = (
generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
outputs = self.language_model.generate(
inputs_embeds=inputs_embeds,
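A toy check (editor-added) of why the in-place assignment above lines up: the boolean mask selects exactly `num_query_tokens * hidden_size` positions, matching the element count of the flattened query embeddings:

```python
import torch

inputs_embeds = torch.zeros(1, 6, 4)
language_model_inputs = torch.ones(1, 3, 4)         # 3 query embeddings, hidden size 4
input_ids = torch.tensor([[50, 50, 50, 2, 7, 9]])   # 50 stands in for image_token_index

special_image_mask = (input_ids == 50).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()  # fills 3 * 4 = 12 slots in order
```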


@@ -22,11 +22,21 @@ from typing import List, Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...tokenization_utils_base import (
AddedToken,
BatchEncoding,
PaddingStrategy,
PreTokenizedInput,
TextInput,
TruncationStrategy,
)
from ...utils import TensorType, logging
from ..auto import AutoTokenizer
logger = logging.get_logger(__name__)
class InstructBlipProcessor(ProcessorMixin):
r"""
Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
@@ -42,18 +52,22 @@ class InstructBlipProcessor(ProcessorMixin):
An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
qformer_tokenizer (`AutoTokenizer`, *optional*):
An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
num_query_tokens (`int`, *optional*):
Number of tokens used by the Qformer as queries; should be the same as in the model's config.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = []
valid_kwargs = ["num_query_tokens"]
image_processor_class = "BlipImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, **kwargs):
super().__init__(image_processor, tokenizer)
def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, num_query_tokens=None, **kwargs):
# add QFormer tokenizer
self.qformer_tokenizer = qformer_tokenizer
self.image_token = AddedToken("<image>", normalized=False, special=True)
tokenizer.add_tokens([self.image_token], special_tokens=True)
self.num_query_tokens = num_query_tokens
super().__init__(image_processor, tokenizer)
def __call__(
self,
@@ -87,7 +101,12 @@ class InstructBlipProcessor(ProcessorMixin):
encoding = BatchFeature()
if text is not None:
text_encoding = self.tokenizer(
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
_text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
@@ -102,9 +121,32 @@ class InstructBlipProcessor(ProcessorMixin):
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
return_tensors=None, # needed to concatenate below
**kwargs,
)
# if we know how many query tokens, expand text inside processor. We need this hacky manipulation
# because BLIP expects image tokens to be at the beginning even before BOS token
if self.num_query_tokens is not None and images is not None:
text_encoding = {}
image_tokens = self.image_token.content * self.num_query_tokens
image_token_encoding = self.tokenizer([image_tokens], add_special_tokens=False, return_tensors=None)
for k in _text_encoding:
text_encoding[k] = [
img_encoding + txt_encoding
for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
]
else:
text_encoding = _text_encoding
if images is not None:
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cast to desired return tensors type after concatenating
text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
encoding.update(text_encoding)
qformer_text_encoding = self.qformer_tokenizer(
text=text,


@@ -276,6 +276,8 @@ class InstructBlipVideoConfig(PretrainedConfig):
num_query_tokens (`int`, *optional*, defaults to 32):
The number of query tokens passed through the Transformer.
video_token_index (`int`, *optional*):
Token index of special video token.
kwargs (*optional*):
Dictionary of keyword arguments.
@@ -311,7 +313,15 @@ class InstructBlipVideoConfig(PretrainedConfig):
model_type = "instructblipvideo"
def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
def __init__(
self,
vision_config=None,
qformer_config=None,
text_config=None,
num_query_tokens=32,
video_token_index=None,
**kwargs,
):
super().__init__(**kwargs)
if vision_config is None:
@@ -335,6 +345,7 @@ class InstructBlipVideoConfig(PretrainedConfig):
self.is_encoder_decoder = self.text_config.is_encoder_decoder
self.num_query_tokens = num_query_tokens
self.video_token_index = video_token_index
self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
self.initializer_factor = 1.0


@@ -260,11 +260,24 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGenera
)
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_model_attention_mask.to(attention_mask.device), attention_mask], dim=1)
# if the model already has "video_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "video_token_index", None) is not None:
special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
"Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
)
if self.config.use_decoder_only_language_model:
outputs = self.language_model(
@@ -394,17 +407,32 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGenera
)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)
# concatenate query embeddings with prompt embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
# if the model already has "video_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "video_token_index", None) is not None:
special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
"Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
)
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = (
generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
outputs = self.language_model.generate(
inputs_embeds=inputs_embeds,


@@ -1495,11 +1495,25 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel
)
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_model_attention_mask.to(attention_mask.device), attention_mask], dim=1)
# if the model already has "video_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "video_token_index", None) is not None:
special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
"Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
)
if self.config.use_decoder_only_language_model:
outputs = self.language_model(
@@ -1629,17 +1643,32 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel
)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)
# concatenate query embeddings with prompt embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
# if the model already has "video_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if getattr(self.config, "video_token_index", None) is not None:
special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
"Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
)
# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the BOS prepended by `generate`.
if not self.language_model.config.is_encoder_decoder:
generate_kwargs["max_length"] = (
generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
)
generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
outputs = self.language_model.generate(
inputs_embeds=inputs_embeds,


@@ -22,11 +22,21 @@ from typing import List, Optional, Union
from ...image_processing_utils import BatchFeature
from ...image_utils import VideoInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...tokenization_utils_base import (
AddedToken,
BatchEncoding,
PaddingStrategy,
PreTokenizedInput,
TextInput,
TruncationStrategy,
)
from ...utils import TensorType, logging
from ..auto import AutoTokenizer
logger = logging.get_logger(__name__)
class InstructBlipVideoProcessor(ProcessorMixin):
r"""
Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
@@ -42,18 +52,22 @@ class InstructBlipVideoProcessor(ProcessorMixin):
An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
qformer_tokenizer (`AutoTokenizer`, *optional*):
An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
num_query_tokens (`int`, *optional*):
Number of tokens used by the Qformer as queries; should be the same as in the model's config.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = []
valid_kwargs = ["num_query_tokens"]
image_processor_class = "InstructBlipVideoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, **kwargs):
super().__init__(image_processor, tokenizer)
def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, num_query_tokens=None, **kwargs):
# add QFormer tokenizer
self.qformer_tokenizer = qformer_tokenizer
self.video_token = AddedToken("<video>", normalized=False, special=True)
tokenizer.add_tokens([self.video_token], special_tokens=True)
self.num_query_tokens = num_query_tokens
super().__init__(image_processor, tokenizer)
def __call__(
self,
@@ -84,7 +98,12 @@ class InstructBlipVideoProcessor(ProcessorMixin):
encoding = BatchFeature()
if text is not None:
text_encoding = self.tokenizer(
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
_text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
@@ -99,9 +118,34 @@ class InstructBlipVideoProcessor(ProcessorMixin):
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
return_tensors=None, # required to concatenate below
**kwargs,
)
# if we know how many query tokens, expand text inside processor. We need this hacky manipulation
# because BLIP expects image tokens to be at the beginning even before BOS token
if self.num_query_tokens is not None and images is not None:
text_encoding = {}
video_tokens = (
self.video_token.content * self.num_query_tokens * 4
) # InstructBLIP works with 4 frames only
video_token_encoding = self.tokenizer([video_tokens], add_special_tokens=False, return_tensors=None)
for k in _text_encoding:
text_encoding[k] = [
img_encoding + txt_encoding
for img_encoding, txt_encoding in zip(video_token_encoding[k], _text_encoding[k])
]
else:
text_encoding = _text_encoding
if images is not None:
logger.warning_once(
"Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cast to desired return tensors type after concatenating
text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
encoding.update(text_encoding)
qformer_text_encoding = self.qformer_tokenizer(
text=text,


@@ -48,6 +48,8 @@ class LlavaConfig(PretrainedConfig):
Can be one of `"default"` or `"full"`.
vision_feature_layer (`int`, *optional*, defaults to -2):
The index of the layer to select the vision feature.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.
Example:
@@ -82,11 +84,13 @@ class LlavaConfig(PretrainedConfig):
projector_hidden_act="gelu",
vision_feature_select_strategy="default",
vision_feature_layer=-2,
image_seq_length=576,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
self.image_seq_length = image_seq_length
if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
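For orientation (an editor-added note, assuming the usual CLIP ViT-L/14 vision tower at 336px resolution), the default of 576 corresponds to the per-image patch count:

```python
image_size, patch_size = 336, 14
image_seq_length = (image_size // patch_size) ** 2  # 24 * 24 = 576 patch embeddings per image
```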


@@ -23,7 +23,6 @@ from torch import nn
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_outputs import ModelOutput
from ...utils import (
add_start_docstrings,
@@ -230,6 +229,10 @@ LLAVA_INPUTS_DOCSTRING = r"""
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@@ -373,6 +376,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
r"""
Args:
@@ -419,63 +423,90 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
else self.config.vision_feature_select_strategy
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if pixel_values is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
legacy_processing = False
if inputs_embeds is None:
# 1. Extract the input embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and input_ids.shape[1] != 1:
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
# this is not memory efficient at all (output_hidden_states=True) will save all the hidden states.
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
# if the number of image tokens is more than image embeddings seq length, then probably we expanded it in processing
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
# In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
legacy_processing = (
(input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
) or (input_ids.shape[-1] == 1 and pixel_values is not None)
if vision_feature_select_strategy == "default":
selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature
if pixel_values is not None:
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
# this is not memory efficient at all (output_hidden_states=True) will save all the hidden states.
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
if vision_feature_select_strategy == "default":
selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature
else:
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
image_features = self.multi_modal_projector(selected_image_feature)
if legacy_processing:
logger.warning_once(
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# prefill stage vs decoding stage (legacy behavior copied)
if input_ids.shape[1] != 1:
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
image_features, inputs_embeds, input_ids, attention_mask, labels
)
else:
raise ValueError(
f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
# Retrieve the first layer to inspect the logits and mask out the hidden states
# that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
# Get the target length
target_length = input_ids.shape[1]
past_length = first_layer_past_key_value.shape[-1]
extended_attention_mask = torch.ones(
(attention_mask.shape[0], past_length),
dtype=attention_mask.dtype,
device=attention_mask.device,
)
image_features = self.multi_modal_projector(selected_image_feature)
inputs_embeds = inputs_embeds.to(image_features.dtype)
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
image_features, inputs_embeds, input_ids, attention_mask, labels
# Filter out only the tokens that can be un-attended, this can happen
# if one uses Llava + Fused modules where the cache on the
# first iteration is already big enough, or if one passes custom cache
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
new_batch_index = batch_index[valid_indices]
new_non_attended_tokens = non_attended_tokens[valid_indices]
# Zero-out the places where we don't need to attend
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
# TODO: @raushan retain only the new behavior after v4.47
else:
special_image_mask = (
(input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
# In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
# generation with cache
elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
# Retrieve the first layer to inspect the logits and mask out the hidden states
# that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
# Get the target length
target_length = input_ids.shape[1]
past_length = first_layer_past_key_value.shape[-1]
extended_attention_mask = torch.ones(
(attention_mask.shape[0], past_length),
dtype=attention_mask.dtype,
device=attention_mask.device,
)
# Filter out only the tokens that can be un-attended, this can happen
# if one uses Llava + Fused modules where the cache on the
# first iteration is already big enough, or if one passes custom cache
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
new_batch_index = batch_index[valid_indices]
new_non_attended_tokens = non_attended_tokens[valid_indices]
# Zero-out the places where we don't need to attend
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
attention_mask=attention_mask,
@@ -486,6 +517,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
logits = outputs[0]
@@ -519,56 +551,37 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel):
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
self,
input_ids,
past_key_values=None,
inputs_embeds=None,
pixel_values=None,
attention_mask=None,
cache_position=None,
**kwargs,
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
else:
cache_length = past_length = past_key_values[0][0].shape[2]
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
elif self.config.image_token_index in input_ids:
input_ids = input_ids[:, input_ids.shape[1] - 1 :]
# If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
# older attention values, as their corresponding values are not part of the input.
if cache_length < past_length and attention_mask is not None:
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
"pixel_values": pixel_values,
}
# Trigger the new behavior if we have more than image embeddings seq length tokens for images
legacy_processing = (
input_ids is not None
and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
)
model_inputs = self.language_model.prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
cache_position=cache_position,
**kwargs,
)
if legacy_processing:
model_inputs["pixel_values"] = pixel_values
elif cache_position[0] == 0:
# If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
# Otherwise we need pixel values to be passed to model
model_inputs["pixel_values"] = pixel_values
return model_inputs
def _reorder_cache(self, *args, **kwargs):
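A toy run (editor-added, token ids are made up) of the `legacy_processing` heuristic used in `forward` and `prepare_inputs_for_generation` above:

```python
import torch

image_token_index, image_seq_length = 32000, 576

unexpanded = torch.tensor([[1, 32000, 306, 29879]])             # single "<image>" left by an old processor
expanded = torch.tensor([[1] + [32000] * 576 + [306, 29879]])   # processor already expanded the token

print((unexpanded == image_token_index).sum(1).max() < image_seq_length)  # tensor(True)  -> legacy merge path
print((expanded == image_token_index).sum(1).max() < image_seq_length)    # tensor(False) -> masked_scatter path
```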


@@ -19,10 +19,13 @@ Processor class for Llava.
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...image_utils import ImageInput, get_image_size, to_numpy_array
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class LlavaProcessor(ProcessorMixin):
@@ -37,16 +40,35 @@ class LlavaProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
patch_size (`int`, *optional*):
Patch size from the vision tower.
vision_feature_select_strategy (`str`, *optional*):
The feature selection strategy used to select the vision feature from the vision backbone.
Should be the same as in the model's config
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
def __init__(
self,
image_processor=None,
tokenizer=None,
patch_size=None,
vision_feature_select_strategy=None,
chat_template=None,
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
**kwargs,
):
self.patch_size = patch_size
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = image_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
def __call__(
@@ -107,10 +129,42 @@ class LlavaProcessor(ProcessorMixin):
image_inputs = self.image_processor(images, return_tensors=return_tensors)
else:
image_inputs = {}
text_inputs = self.tokenizer(
text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
)
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
# try to expand inputs in processing if we have the necessary parts
if image_inputs.get("pixel_values") is not None:
if self.patch_size is not None and self.vision_feature_select_strategy is not None:
# Replace the image token with the expanded image token sequence
pixel_values = image_inputs["pixel_values"]
height, width = get_image_size(to_numpy_array(pixel_values[0]))
num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
if self.vision_feature_select_strategy == "default":
num_image_tokens -= 1
prompt_strings = []
for sample in text:
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
prompt_strings.append(sample)
else:
prompt_strings = text
logger.warning_once(
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
text_inputs = self.tokenizer(
prompt_strings,
return_tensors=return_tensors,
padding=padding,
truncation=truncation,
max_length=max_length,
)
return BatchFeature(data={**text_inputs, **image_inputs})
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
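A usage sketch (editor-added; checkpoint name and prompt format are illustrative): once `patch_size` and `vision_feature_select_strategy` are set, the processor expands `<image>` itself and the deprecation warning above is not emitted:

```python
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
processor.patch_size = 14
processor.vision_feature_select_strategy = "default"

image = Image.new("RGB", (336, 336))  # stand-in for a real image
prompt = "USER: <image>\nWhat is shown in this image? ASSISTANT:"
inputs = processor(images=image, text=prompt, return_tensors="pt")

# the single "<image>" in the prompt has been replaced by 576 placeholder ids
image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
print((inputs["input_ids"] == image_token_id).sum())  # tensor(576)
```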


@@ -53,6 +53,8 @@ class LlavaNextConfig(PretrainedConfig):
of the form `(height, width)`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.
Example:
@@ -89,11 +91,13 @@ class LlavaNextConfig(PretrainedConfig):
vision_feature_layer=-2,
image_grid_pinpoints=None,
tie_word_embeddings=False,
image_seq_length=576,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
self.image_seq_length = image_seq_length
if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(


@@ -25,7 +25,6 @@ from torch import nn
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...image_processing_utils import select_best_resolution
from ...modeling_outputs import ModelOutput
from ...utils import (
@@ -336,6 +335,10 @@ LLAVA_NEXT_INPUTS_DOCSTRING = r"""
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@@ -708,6 +711,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
r"""
Args:
@@ -754,104 +758,118 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
else self.config.vision_feature_select_strategy
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if pixel_values is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
legacy_processing = False
if inputs_embeds is None:
# 1. Extract the input embeddings
# In case image_token_index is not in the embeddings (extra token but embedding don't have it)
for_inputs_embeds_ids = input_ids.clone()
for_inputs_embeds_ids[(input_ids == self.config.image_token_index)] = 0
inputs_embeds = self.get_input_embeddings()(for_inputs_embeds_ids)
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0:
# ! infer image_num_patches from image_sizes
image_num_patches = [
image_size_to_num_patches(
image_size=imsize,
grid_pinpoints=self.config.image_grid_pinpoints,
patch_size=self.config.vision_config.image_size,
)
for imsize in image_sizes
# if the number of image tokens is more than image embeddings seq length, then probably we expanded it in processing
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
# In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
legacy_processing = (
(input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
) or (input_ids.shape[-1] == 1 and pixel_values is not None)
if pixel_values is not None and pixel_values.size(0) > 0:
# ! infer image_num_patches from image_sizes
image_num_patches = [
image_size_to_num_patches(
image_size=imsize,
grid_pinpoints=self.config.image_grid_pinpoints,
patch_size=self.config.vision_config.image_size,
)
for imsize in image_sizes
]
# figure out if pixel_values is concatenated or stacked
if pixel_values.dim() == 5:
# stacking when input is (batch_size, num_patches, num_channels, height, width)
_pixel_values_list = [
pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)
]
# figure out if pixel_values is concatenated or stacked
if pixel_values.dim() == 5:
# stacking when input is (batch_size, num_patches, num_channels, height, width)
_pixel_values_list = [
pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)
]
pixel_values = torch.cat(_pixel_values_list, dim=0)
elif pixel_values.dim() != 4:
# otherwise has to be stacked from list of (num_patches, num_channels, height, width)
raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
pixel_values = torch.cat(_pixel_values_list, dim=0)
elif pixel_values.dim() != 4:
# otherwise has to be stacked from list of (num_patches, num_channels, height, width)
raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
image_features = self.vision_tower(pixel_values, output_hidden_states=True)
selected_image_feature = image_features.hidden_states[vision_feature_layer]
image_features = self.vision_tower(pixel_values, output_hidden_states=True)
selected_image_feature = image_features.hidden_states[vision_feature_layer]
if vision_feature_select_strategy == "default":
selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature
image_features = self.multi_modal_projector(selected_image_feature)
image_features = torch.split(image_features, image_num_patches, dim=0)
if vision_feature_select_strategy == "default":
selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = selected_image_feature
image_features = self.multi_modal_projector(selected_image_feature)
image_features = torch.split(image_features, image_num_patches, dim=0)
# NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
image_features, feature_lens = self.pack_image_features(
image_features,
image_sizes,
image_newline=self.image_newline,
# NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
image_features, feature_lens = self.pack_image_features(
image_features,
image_sizes,
image_newline=self.image_newline,
)
if legacy_processing:
logger.warning_once(
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
if input_ids.shape[1] != 1:
inputs_embeds = inputs_embeds.to(image_features.dtype)
inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
image_features,
feature_lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
)
else:
# Retrieve the first layer to inspect the logits and mask out the hidden states
# that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
inputs_embeds = inputs_embeds.to(image_features.dtype)
inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
image_features,
feature_lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
# Get the target length
target_length = input_ids.shape[1]
past_length = first_layer_past_key_value.shape[-1]
extended_attention_mask = torch.ones(
(attention_mask.shape[0], past_length),
dtype=attention_mask.dtype,
device=attention_mask.device,
)
# Filter out only the tokens that can be un-attended, this can happen
# if one uses Llava + Fused modules where the cache on the
# first iteration is already big enough, or if one passes custom cache
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
new_batch_index = batch_index[valid_indices]
new_non_attended_tokens = non_attended_tokens[valid_indices]
# Zero-out the places where we don't need to attend
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
# TODO: @raushan retain only the new behavior after v4.47
else:
special_image_mask = (
(input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
attention_mask=attention_mask,
@ -862,6 +880,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
logits = outputs[0]
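For reference, the non-legacy path above relies on `masked_scatter` to drop the projected image features into the placeholder positions, which keeps the operation out-of-place. A minimal sketch with toy tensors (the image token id and shapes below are illustrative, not taken from any checkpoint):

import torch

image_token_id = 32000                                    # illustrative id
input_ids = torch.tensor([[1, 32000, 32000, 32000, 5, 6]])
inputs_embeds = torch.zeros(1, 6, 4)                      # (batch, seq, hidden)
image_features = torch.ones(3, 4)                         # one row per <image> placeholder

special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

assert torch.equal(inputs_embeds[0, 1:4], image_features)  # features landed on the placeholder rows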
@ -902,57 +921,32 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel):
pixel_values=None,
image_sizes=None,
attention_mask=None,
cache_position=None,
**kwargs,
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
else:
cache_length = past_length = past_key_values[0][0].shape[2]
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
elif self.config.image_token_index in input_ids:
input_ids = input_ids[:, input_ids.shape[1] - 1 :]
# If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
# older attention values, as their corresponding values are not part of the input.
if cache_length < past_length and attention_mask is not None:
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
"pixel_values": pixel_values,
"image_sizes": image_sizes,
}
)
legacy_processing = (
input_ids is not None
and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
)
model_inputs = self.language_model.prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
cache_position=cache_position,
**kwargs,
)
if legacy_processing:
model_inputs["pixel_values"] = pixel_values
model_inputs["image_sizes"] = image_sizes
elif cache_position[0] == 0:
# If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
# Otherwise we need pixel values to be passed to model
model_inputs["pixel_values"] = pixel_values
model_inputs["image_sizes"] = image_sizes
return model_inputs
# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration._reorder_cache
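For reference, the effect of the gate above is that pixel inputs only reach the model on the prefill step once the processor has already expanded the image tokens; during cached decoding they are dropped. A toy restatement of that condition (a hypothetical helper, not the library call itself):

import torch

def forwards_pixel_values(cache_position, legacy_processing):
    # pixels are needed at prefill (cache_position starts at 0) or on the legacy path,
    # where the model still expands image tokens itself
    return legacy_processing or cache_position[0] == 0

prefill_positions = torch.arange(0, 12)   # first pass over the full prompt
decode_positions = torch.tensor([12])     # later single-token decoding steps
assert forwards_pixel_values(prefill_positions, legacy_processing=False)
assert not forwards_pixel_values(decode_positions, legacy_processing=False)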

View File

@ -19,10 +19,14 @@ Processor class for LLaVa-NeXT.
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...image_processing_utils import select_best_resolution
from ...image_utils import ImageInput, get_image_size, to_numpy_array
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class LlavaNextProcessor(ProcessorMixin):
@ -37,16 +41,35 @@ class LlavaNextProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
patch_size (`int`, *optional*):
Patch size from the vision tower.
vision_feature_select_strategy (`str`, *optional*):
The feature selection strategy used to select the vision feature from the vision backbone.
Should be the same as in the model's config.
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
def __init__(
self,
image_processor=None,
tokenizer=None,
patch_size=None,
vision_feature_select_strategy=None,
chat_template=None,
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
**kwargs,
):
self.patch_size = patch_size
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = image_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
def __call__(
@ -111,12 +134,89 @@ class LlavaNextProcessor(ProcessorMixin):
image_inputs = self.image_processor(images, do_pad=do_pad, return_tensors=return_tensors)
else:
image_inputs = {}
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
if self.patch_size is None or self.vision_feature_select_strategy is None:
prompt_strings = text
logger.warning_once(
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cannot infer image expansion length if no images are found
elif not image_inputs:
prompt_strings = text
else:
image_sizes = image_inputs["image_sizes"]
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
prompt_strings = []
for image_size, sample in zip(image_sizes, text):
# Replace the image token with the expanded image token sequence
orig_height, orig_width = image_size
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= 1
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
prompt_strings.append(sample)
text_inputs = self.tokenizer(
text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
prompt_strings,
return_tensors=return_tensors,
padding=padding,
truncation=truncation,
max_length=max_length,
)
return BatchFeature(data={**text_inputs, **image_inputs})
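For reference, once `patch_size` and `vision_feature_select_strategy` are set, the processor replaces each `<image>` placeholder with the full run of image tokens before tokenization, so the model no longer expands them itself. A hedged usage sketch (the checkpoint name and image URL are only examples):

import requests
from PIL import Image
from transformers import LlavaNextProcessor

processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")  # example checkpoint
processor.patch_size = 14
processor.vision_feature_select_strategy = "default"

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(text="USER: <image>\nWhat is shown here? ASSISTANT:", images=image, return_tensors="pt")
print(inputs["input_ids"].shape)  # the sequence already contains the expanded run of <image> tokens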
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
image_grid_pinpoints = self.image_processor.image_grid_pinpoints
height_best_resolution, width_best_resolution = select_best_resolution(
[orig_height, orig_width], image_grid_pinpoints
)
scale_height, scale_width = height_best_resolution // height, width_best_resolution // width
patches_height = height // self.patch_size
patches_width = width // self.patch_size
unpadded_features, newline_features = self._get_unpadded_features(
orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
)
# The base patch covers the entire image (+1 for the CLS)
base_features = patches_height * patches_width + 1
num_image_tokens = unpadded_features + newline_features + base_features
return num_image_tokens
def _get_unpadded_features(self, height, width, patches_height, patches_width, scale_height, scale_width):
"""
Get number of features for a given image with height/width. LLaVA-NeXT is different from LLaVA
because it divides each image into patches depending on its resolution. Therefore we need to calculate how many
patches an image is divided into and get the number of features from that.
"""
current_width = patches_height * scale_height
current_height = patches_width * scale_width
original_aspect_ratio = width / height
current_aspect_ratio = current_width / current_height
if original_aspect_ratio > current_aspect_ratio:
new_height = (height * current_width) // width
padding = (current_height - new_height) // 2
current_height -= padding * 2
else:
new_width = (width * current_height) // height
padding = (current_width - new_width) // 2
current_width -= padding * 2
unpadded_features = current_height * current_width
newline_features = current_height
return (unpadded_features, newline_features)
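For reference, a worked example of the count these two helpers produce, assuming a vision tower with patch_size=14, a 336x336 crop, and a best resolution of 672x672 picked from image_grid_pinpoints (illustrative numbers, not tied to a specific checkpoint):

patch_size = 14
height = width = 336                                    # crop fed to the vision tower
orig_height = orig_width = 672                          # best resolution from image_grid_pinpoints

patches_height = patches_width = height // patch_size   # 24
scale_height = scale_width = orig_height // height      # 2

current_width = patches_height * scale_height           # 48
current_height = patches_width * scale_width            # 48
unpadded_features = current_height * current_width      # 2304 (square image, nothing cropped away)
newline_features = current_height                       # 48 newline tokens, one per row
base_features = patches_height * patches_width + 1      # 577, the downscaled base image + CLS

num_image_tokens = unpadded_features + newline_features + base_features  # 2929
num_image_tokens -= 1                                    # 2928 with the "default" select strategy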
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):
"""

View File

@ -22,9 +22,15 @@
from transformers import PretrainedConfig
from ...utils import (
logging,
)
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
class LlavaNextVideoConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlavaNextVideoForConditionalGeneration`]. It is used to instantiate an
@ -62,6 +68,10 @@ class LlavaNextVideoConfig(PretrainedConfig):
Pooling mode to use for videos. Can be "average", "max" or "conv".
spatial_pool_stride (`int`, *optional*, defaults to 2):
Stride used in the pooling layer for videos.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.
video_seq_length (`int`, *optional*, defaults to 288):
Sequence length of one video embedding.
Example:
@ -99,11 +109,15 @@ class LlavaNextVideoConfig(PretrainedConfig):
video_token_index=32000,
spatial_pool_mode="average",
spatial_pool_stride=2,
image_seq_length=576,
video_seq_length=288,
**kwargs,
):
self.video_token_index = video_token_index
self.spatial_pool_mode = spatial_pool_mode
self.spatial_pool_stride = spatial_pool_stride
self.image_seq_length = image_seq_length
self.video_seq_length = video_seq_length
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
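For reference, a minimal instantiation showing the two new fields (the values are just the documented defaults; everything else falls back to the default sub-configs):

from transformers import LlavaNextVideoConfig

config = LlavaNextVideoConfig(image_seq_length=576, video_seq_length=288)
print(config.image_seq_length, config.video_seq_length)  # 576 288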

View File

@ -64,6 +64,10 @@ class LlavaNextVideoConfig(PretrainedConfig):
Pooling mode to use for videos. Can be "average", "max" or "conv".
spatial_pool_stride (`int`, *optional*, defaults to 2):
Stride used in the pooling layer for videos.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.
video_seq_length (`int`, *optional*, defaults to 288):
Sequence length of one video embedding.
projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
The activation function used by the multimodal projector.
vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
@ -114,11 +118,15 @@ class LlavaNextVideoConfig(PretrainedConfig):
video_token_index=32000,
spatial_pool_mode="average",
spatial_pool_stride=2,
image_seq_length=576,
video_seq_length=288,
**kwargs,
):
self.video_token_index = video_token_index
self.spatial_pool_mode = spatial_pool_mode
self.spatial_pool_stride = spatial_pool_stride
self.image_seq_length = image_seq_length
self.video_seq_length = video_seq_length
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
@ -375,90 +383,106 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration):
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
legacy_processing = False
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
# Merge text and images in prefill stage
if past_key_values is None:
# First merge image tokens if there are any
if pixel_values is not None and pixel_values.size(0) > 0:
image_features = self._get_image_features(pixel_values, image_sizes)
image_features, feature_lens = self.pack_image_features(
image_features,
image_sizes,
image_newline=self.image_newline,
)
inputs_embeds = inputs_embeds.to(image_features.dtype)
(
inputs_embeds,
attention_mask,
position_ids,
labels,
input_ids,
) = self._merge_input_ids_with_image_features(
image_features,
feature_lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
image_token_index=self.config.image_token_index,
)
# Then merge video tokens if there are any
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
video_features = self._get_video_features(pixel_values_videos)
video_features = [feature.flatten(0, 1) for feature in video_features]
feature_lens = [feature.size(0) for feature in video_features]
video_features = torch.cat(video_features, dim=0)
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=video_features.device)
(
inputs_embeds,
attention_mask,
position_ids,
labels,
input_ids,
) = self._merge_input_ids_with_image_features(
video_features,
feature_lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
image_token_index=self.config.video_token_index,
)
# pixel_values is not None but is empty ---> text only cases
elif (pixel_values is not None and pixel_values.size(0) == 0) or (
pixel_values_videos is not None and pixel_values_videos.size(0) == 0
):
pass
# if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
img_token_count = (input_ids == self.config.image_token_index).sum(1).max()
video_token_count = (input_ids == self.config.video_token_index).sum(1).max()
inputs_expanded = (
img_token_count < self.config.image_seq_length and video_token_count < self.config.video_seq_length
)
pixels_present = input_ids.shape[-1] == 1 and pixel_values is not None and pixel_values_videos is not None
legacy_processing = inputs_expanded or pixels_present
image_features = feature_lens = None
if pixel_values is not None and pixel_values.size(0) > 0:
image_features = self._get_image_features(pixel_values, image_sizes)
image_features, feature_lens = self.pack_image_features(
image_features,
image_sizes,
image_newline=self.image_newline,
)
video_features = video_feature_lens = None
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
video_features = self._get_video_features(pixel_values_videos)
video_features = [feature.flatten(0, 1) for feature in video_features]
video_feature_lens = [feature.size(0) for feature in video_features]
video_features = torch.cat(video_features, dim=0)
video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
if legacy_processing:
logger.warning_once(
"Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
if input_ids.shape[1] != 1:
iterator = (
(image_features, feature_lens, self.config.image_token_index),
(video_features, video_feature_lens, self.config.video_token_index),
)
for features, lens, special_token in iterator:
if features is not None:
(
inputs_embeds,
attention_mask,
position_ids,
labels,
input_ids,
) = self._merge_input_ids_with_image_features(
features,
lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
image_token_index=special_token,
)
else:
# Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
# Get the target length
target_length = input_ids.shape[1]
past_length = first_layer_past_key_value.shape[-1]
extended_attention_mask = torch.ones(
(attention_mask.shape[0], past_length),
dtype=attention_mask.dtype,
device=attention_mask.device,
)
# Filter out only the tokens that can be un-attended, this can happen
# if one uses Llava + Fused modules where the cache on the
# first iteration is already big enough, or if one passes custom cache
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
new_batch_index = batch_index[valid_indices]
new_non_attended_tokens = non_attended_tokens[valid_indices]
# Zero-out the places where we don't need to attend
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
# TODO: @raushan retain only the new behavior after v4.47
else:
if image_features is not None:
special_image_mask = (
(input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
if video_features is not None:
special_image_mask = (
(input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
outputs = self.language_model(
attention_mask=attention_mask,

View File

@ -376,6 +376,10 @@ LLAVA_NEXT_VIDEO_INPUTS_DOCSTRING = r"""
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@ -849,90 +853,106 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel):
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
legacy_processing = False
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
# Merge text and images in prefill stage
if past_key_values is None:
# First merge image tokens if there are any
if pixel_values is not None and pixel_values.size(0) > 0:
image_features = self._get_image_features(pixel_values, image_sizes)
image_features, feature_lens = self.pack_image_features(
image_features,
image_sizes,
image_newline=self.image_newline,
)
inputs_embeds = inputs_embeds.to(image_features.dtype)
(
inputs_embeds,
attention_mask,
position_ids,
labels,
input_ids,
) = self._merge_input_ids_with_image_features(
image_features,
feature_lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
image_token_index=self.config.image_token_index,
)
# Then merge video tokens if there are any
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
video_features = self._get_video_features(pixel_values_videos)
video_features = [feature.flatten(0, 1) for feature in video_features]
feature_lens = [feature.size(0) for feature in video_features]
video_features = torch.cat(video_features, dim=0)
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=video_features.device)
(
inputs_embeds,
attention_mask,
position_ids,
labels,
input_ids,
) = self._merge_input_ids_with_image_features(
video_features,
feature_lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
image_token_index=self.config.video_token_index,
)
# pixel_values is not None but is empty ---> text only cases
elif (pixel_values is not None and pixel_values.size(0) == 0) or (
pixel_values_videos is not None and pixel_values_videos.size(0) == 0
):
pass
# if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
img_token_count = (input_ids == self.config.image_token_index).sum(1).max()
video_token_count = (input_ids == self.config.video_token_index).sum(1).max()
inputs_expanded = (
img_token_count < self.config.image_seq_length and video_token_count < self.config.video_seq_length
)
pixels_present = input_ids.shape[-1] == 1 and pixel_values is not None and pixel_values_videos is not None
legacy_processing = inputs_expanded or pixels_present
image_features = feature_lens = None
if pixel_values is not None and pixel_values.size(0) > 0:
image_features = self._get_image_features(pixel_values, image_sizes)
image_features, feature_lens = self.pack_image_features(
image_features,
image_sizes,
image_newline=self.image_newline,
)
video_features = video_feature_lens = None
if pixel_values_videos is not None and pixel_values_videos.size(0) > 0:
video_features = self._get_video_features(pixel_values_videos)
video_features = [feature.flatten(0, 1) for feature in video_features]
video_feature_lens = [feature.size(0) for feature in video_features]
video_features = torch.cat(video_features, dim=0)
video_feature_lens = torch.tensor(video_feature_lens, dtype=torch.long, device=video_features.device)
if legacy_processing:
logger.warning_once(
"Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
if input_ids.shape[1] != 1:
iterator = (
(image_features, feature_lens, self.config.image_token_index),
(video_features, video_feature_lens, self.config.video_token_index),
)
for features, lens, special_token in iterator:
if features is not None:
(
inputs_embeds,
attention_mask,
position_ids,
labels,
input_ids,
) = self._merge_input_ids_with_image_features(
features,
lens,
inputs_embeds,
input_ids,
attention_mask,
position_ids,
labels=labels,
image_token_index=special_token,
)
else:
# Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
# Get the target length
target_length = input_ids.shape[1]
past_length = first_layer_past_key_value.shape[-1]
extended_attention_mask = torch.ones(
(attention_mask.shape[0], past_length),
dtype=attention_mask.dtype,
device=attention_mask.device,
)
# Filter out only the tokens that can be un-attended, this can happen
# if one uses Llava + Fused modules where the cache on the
# first iteration is already big enough, or if one passes custom cache
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
new_batch_index = batch_index[valid_indices]
new_non_attended_tokens = non_attended_tokens[valid_indices]
# Zero-out the places where we don't need to attend
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
# TODO: @raushan retain only the new behavior after v4.47
else:
if image_features is not None:
special_image_mask = (
(input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
if video_features is not None:
special_image_mask = (
(input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
outputs = self.language_model(
attention_mask=attention_mask,

View File

@ -19,7 +19,7 @@ Processor class for LLaVa-NeXT-Video.
from typing import TYPE_CHECKING, List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, VideoInput
from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType, logging
@ -48,17 +48,41 @@ class LlavaNextVideoProcessor(ProcessorMixin):
The tokenizer is a required input.
chat_template (`str`, *optional*):
Jinja chat template that will be used in tokenizer's `apply_chat_template`
patch_size (`int`, *optional*):
Patch size from the vision tower.
vision_feature_select_strategy (`str`, *optional*):
The feature selection strategy used to select the vision feature from the vision backbone.
Should be the same as in the model's config.
video_token (`str`, *optional*, defaults to `"<video>"`):
Special token used to denote video location.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
"""
# video and image processor share same args, but have different processing logic
# only image processor config is saved in the hub
attributes = ["video_processor", "image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token", "video_token"]
image_processor_class = "LlavaNextImageProcessor"
video_processor_class = "LlavaNextVideoImageProcessor"
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
def __init__(self, video_processor=None, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
def __init__(
self,
video_processor=None,
image_processor=None,
tokenizer=None,
chat_template=None,
patch_size=None,
vision_feature_select_strategy=None,
video_token="<video>",
image_token="<image>",
**kwargs,
):
self.patch_size = patch_size
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = image_token
self.video_token = video_token
super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template)
def __call__(
@ -131,9 +155,62 @@ class LlavaNextVideoProcessor(ProcessorMixin):
else:
videos_inputs = {}
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
if self.patch_size is None or self.vision_feature_select_strategy is None:
prompt_strings = text
logger.warning_once(
"Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cannot infer image expansion length if no images/videos are found
elif not image_inputs and not videos_inputs:
prompt_strings = text
else:
# images are expanded taking into account the number of patches in each image
if image_inputs:
image_sizes = image_inputs["image_sizes"]
height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
prompt_strings = []
for image_size, sample in zip(image_sizes, text):
# Replace the image token with the expanded image token sequence
orig_height, orig_width = image_size
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= 1
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
prompt_strings.append(sample)
text = prompt_strings
# videos are easier, simply get frames and multiply
if videos_inputs:
one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
height, width = get_image_size(one_video[0])
num_frames = one_video.shape[0] # frame dim is always after batch dim
num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
num_video_tokens = num_image_tokens // 4 * num_frames # divide by 4 needed for avg pooling layer
prompt_strings = []
for sample in text:
sample = sample.replace(self.video_token, self.video_token * num_video_tokens)
prompt_strings.append(sample)
text_inputs = self.tokenizer(
text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
prompt_strings,
return_tensors=return_tensors,
padding=padding,
truncation=truncation,
max_length=max_length,
)
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
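For reference, a worked example of the video expansion above (a 336x336 frame size, patch_size=14 and 8 sampled frames are illustrative choices):

patch_size = 14
height = width = 336
num_frames = 8

num_image_tokens = (height // patch_size) * (width // patch_size)  # 24 * 24 = 576 tokens per frame
num_video_tokens = num_image_tokens // 4 * num_frames              # 576 // 4 * 8 = 1152 after the stride-2 average pooling
# each <video> placeholder in the prompt is replaced by 1152 copies before tokenization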

View File

@ -86,7 +86,7 @@ class PaliGemmaConfig(PretrainedConfig):
hidden_size=2048,
**kwargs,
):
self.ignore_index = ignore_index
self._ignore_index = ignore_index
self.image_token_index = image_token_index
self._vocab_size = vocab_size
self.projection_dim = projection_dim
@ -110,14 +110,11 @@ class PaliGemmaConfig(PretrainedConfig):
vocab_size=257152,
vision_use_head=False,
)
self.vocab_size = self.vocab_size
self.text_config = text_config
if isinstance(self.text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma"
self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
self.vocab_size = self.text_config.vocab_size
elif text_config is None:
self.text_config = CONFIG_MAPPING["gemma"](
hidden_size=2048,
@ -132,6 +129,18 @@ class PaliGemmaConfig(PretrainedConfig):
self.vision_config.projection_dim = projection_dim
super().__init__(**kwargs)
@property
def ignore_index(self):
warnings.warn(
"The `ignore_index` attribute is deprecated and will be removed in v4.47.",
FutureWarning,
)
return self._ignore_index
@ignore_index.setter
def ignore_index(self, value):
self._ignore_index = value
@property
def vocab_size(self):
warnings.warn(
@ -147,4 +156,5 @@ class PaliGemmaConfig(PretrainedConfig):
def to_dict(self):
output = super().to_dict()
output.pop("_vocab_size", None)
output.pop("_ignore_index", None)
return output
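For reference, the deprecated attribute still resolves but now emits a FutureWarning, and the private field is dropped from serialization; a quick sketch:

import warnings
from transformers import PaliGemmaConfig

config = PaliGemmaConfig()  # default sub-configs are built automatically
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _ = config.ignore_index                       # goes through the deprecated property
assert any(issubclass(w.category, FutureWarning) for w in caught)
assert "_ignore_index" not in config.to_dict()    # stripped by the to_dict override above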

View File

@ -21,7 +21,7 @@ import torch
import torch.utils.checkpoint
from torch import nn
from ...cache_utils import Cache
from ...cache_utils import Cache, StaticCache
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
@ -126,6 +126,9 @@ class PaliGemmaPreTrainedModel(PreTrainedModel):
_no_split_modules = ["PaliGemmaMultiModalProjector"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = False
_supports_cache_class = True
_supports_quantized_cache = True
_supports_static_cache = True
_supports_sdpa = True
@ -222,6 +225,10 @@ PALIGEMMA_INPUTS_DOCSTRING = r"""
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@ -285,77 +292,52 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel):
self.vocab_size = model_embeds.num_embeddings
return model_embeds
def _merge_input_ids_with_image_features(
self, image_features, inputs_embeds, input_ids, attention_mask, labels, token_type_ids, cache_position
def _update_causal_mask(
self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False
):
_, _, embed_dim = image_features.shape
batch_size, sequence_length = input_ids.shape
using_static_cache = isinstance(past_key_values, StaticCache)
dtype, device = inputs_embeds.dtype, inputs_embeds.device
min_dtype = torch.finfo(dtype).min
scaled_image_features = image_features / (self.config.hidden_size**0.5)
final_embedding = torch.zeros(
batch_size, sequence_length, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
)
text_mask = (input_ids != self.config.image_token_index) & (input_ids != self.pad_token_id)
image_mask = input_ids == self.config.image_token_index
pad_mask = input_ids == self.pad_token_id
# expand masks to match embedding dimension
text_mask_expanded = text_mask.unsqueeze(-1).expand(-1, -1, embed_dim).to(inputs_embeds.device)
pad_mask_expanded = pad_mask.unsqueeze(-1).expand(-1, -1, embed_dim).to(inputs_embeds.device)
# insert padding and text token embeddings
final_embedding = torch.where(text_mask_expanded, inputs_embeds, final_embedding)
final_embedding = torch.where(pad_mask_expanded, torch.zeros_like(final_embedding), final_embedding)
# insert image embeddings - the image mask is always less or equal to the sentence in length
final_embedding = final_embedding.masked_scatter(
image_mask.unsqueeze(-1).expand_as(final_embedding).to(device=final_embedding.device),
scaled_image_features.to(device=final_embedding.device, dtype=final_embedding.dtype),
)
final_embedding = torch.where(pad_mask_expanded, torch.zeros_like(final_embedding), final_embedding)
if attention_mask is not None:
position_ids = (attention_mask.cumsum(-1)).masked_fill_((attention_mask == 0), 1)
sequence_length = inputs_embeds.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_length()
else:
position_ids = None
target_length = (
attention_mask.shape[-1]
if isinstance(attention_mask, torch.Tensor)
else cache_position[0] + sequence_length + 1
)
if token_type_ids is not None and labels is not None:
# we are training thus we need to create a full mask on the image + prefix but causal on suffix
target_length = cache_position[-1] + 1
if attention_mask is not None and attention_mask.dim() == 4:
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
causal_mask = attention_mask
else:
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
)
# Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(inputs_embeds.shape[0], 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
causal_mask.device
)
# unmask the prefill
if is_training:
causal_mask = torch.triu(causal_mask, diagonal=1)
else:
causal_mask = torch.zeros_like(causal_mask)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(inputs_embeds.shape[0], 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
mask_length = attention_mask.shape[-1]
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
# we are training thus we need to create a full mask on the image + prefix but causal on suffix
if is_training:
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
)
padding_mask = padding_mask == 0
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
padding_mask, min_dtype
)
final_labels = torch.full(
(batch_size, sequence_length), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device
)
final_labels = torch.where(input_ids != self.pad_token_id, labels, final_labels)
else:
causal_mask = attention_mask.unsqueeze(1).unsqueeze(2) * attention_mask.unsqueeze(1).unsqueeze(-1)
# invert causal mask
causal_mask = torch.where(causal_mask == 0, min_dtype, 0)
causal_mask = causal_mask.to(dtype).expand(-1, self.config.text_config.num_key_value_heads, -1, -1)
final_labels = None
return final_embedding, causal_mask, final_labels, position_ids
return causal_mask
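For reference, the pattern this produces during training is a prefix-LM mask: every position can attend to the image and prefix tokens (token_type 0), while the suffix stays causal. A toy reconstruction in plain PyTorch (padding and cache_position handling omitted, -inf standing in for min_dtype):

import torch

seq_len = 6
token_type_ids = torch.tensor([[0, 0, 0, 1, 1, 1]])     # 0 = image/prefix, 1 = suffix

causal = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
mask = causal[None, None].clone()                        # (batch, heads, q_len, kv_len)
# unmask every column that belongs to the prefix, as the token_type_ids branch above does
mask[:, :, :, :seq_len] = mask[:, :, :, :seq_len].masked_fill(token_type_ids[:, None, None, :] == 0, 0)
print(mask[0, 0])  # prefix columns fully visible, suffix block still lower-triangular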
@add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
@ -411,66 +393,63 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel):
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if pixel_values is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# the attention mask is turned 4d after, we keep track of the original one
input_attention_mask = attention_mask
is_training = token_type_ids is not None and labels is not None
if inputs_embeds is None:
# 1. Extract the input embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and input_ids.shape[1] != 1:
image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype))
selected_image_feature = image_outputs.last_hidden_state
image_features = self.multi_modal_projector(selected_image_feature)
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
)
if cache_position is None:
cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device)
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
image_features, inputs_embeds, input_ids, attention_mask, labels, token_type_ids, cache_position
)
if position_ids is None:
position_ids = cache_position.unsqueeze(0) + 1 # Paligemma positions are 1-indexed
# Merge text and images
if pixel_values is not None:
image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype))
selected_image_feature = image_outputs.last_hidden_state
image_features = self.multi_modal_projector(selected_image_feature)
image_features = image_features / (self.config.hidden_size**0.5)
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
if inputs_embeds[special_image_mask].numel() != image_features.numel():
image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index)
raise ValueError(
f"Number of images does not match number of special image tokens in the input text. "
f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
"tokens from image embeddings."
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
else:
# In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
# generation with cache
if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
# Retrieve the first layer to inspect the logits and mask out the hidden states
# that are set to 0
# TODO @molbap this will only work for dynamic cache.
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
# mask out pad-token-ids in labels for BC
if labels is not None and self.pad_token_id in labels:
logger.warning_once(
"`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
"You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
)
labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
causal_mask = self._update_causal_mask(
attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training
)
# Get the target length
target_seqlen = cache_position[-1] + 1
extended_attention_mask = torch.ones(
(attention_mask.shape[0], target_seqlen - attention_mask.shape[1] + 1),
dtype=attention_mask.dtype,
device=attention_mask.device,
)
# Filter out only the tokens that can be un-attended, this can happen
# if one uses PaliGemma+ Fused modules where the cache on the
# first iteration is already big enough, or if one passes custom cache
valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
new_batch_index = batch_index[valid_indices]
new_non_attended_tokens = non_attended_tokens[valid_indices]
# Zero-out the places where we don't need to attend
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
attention_mask = attention_mask.to(inputs_embeds.dtype)
outputs = self.language_model(
attention_mask=attention_mask,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
@ -487,9 +466,9 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel):
if labels is not None:
shift_logits = logits[..., :-1, :]
shift_labels = labels[..., 1:]
if input_attention_mask is not None:
if attention_mask is not None:
# we use the input attention mask to shift the logits and labels, because it is 2D.
shift_attention_mask = input_attention_mask[..., 1:]
shift_attention_mask = attention_mask[..., 1:]
shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
else:
@ -498,7 +477,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel):
# Flatten the tokens
loss_fct = nn.CrossEntropyLoss()
flat_logits = shift_logits.view(-1, self.config.vocab_size)
flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
flat_labels = shift_labels.view(-1).to(shift_logits.device)
loss = loss_fct(flat_logits, flat_labels)
if not return_dict:
@ -526,37 +505,24 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel):
use_cache=True,
**kwargs,
):
# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
# Exception 1: when passing input_embeds, input_ids may be missing entries
# Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
if past_key_values is not None:
if inputs_embeds is not None: # Exception 1
input_ids = input_ids[:, -cache_position.shape[0] :]
elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
input_ids = input_ids[:, cache_position]
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and cache_position[0] == 0:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"cache_position": cache_position,
"use_cache": use_cache,
"attention_mask": attention_mask,
"pixel_values": pixel_values,
"token_type_ids": token_type_ids,
}
model_inputs = self.language_model.prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
cache_position=cache_position,
**kwargs,
)
model_inputs["token_type_ids"] = token_type_ids
# position_ids in Paligemma are 1-indexed
if model_inputs.get("position_ids") is not None:
model_inputs["position_ids"] += 1
# If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
# Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
if cache_position[0] == 0:
model_inputs["pixel_values"] = pixel_values
return model_inputs
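For reference, a toy restatement of the two PaliGemma-specific adjustments layered on top of the language model's own `prepare_inputs_for_generation` output (the helper and values below are illustrative, not the library API):

import torch

def adjust_for_paligemma(model_inputs, pixel_values, token_type_ids, cache_position):
    model_inputs["token_type_ids"] = token_type_ids
    if model_inputs.get("position_ids") is not None:
        model_inputs["position_ids"] = model_inputs["position_ids"] + 1  # positions are 1-indexed
    if cache_position[0] == 0:                                           # prefill: image tokens still in the prompt
        model_inputs["pixel_values"] = pixel_values
    return model_inputs

prefill = adjust_for_paligemma(
    {"position_ids": torch.arange(4)[None, :]},
    pixel_values=torch.zeros(1, 3, 224, 224),
    token_type_ids=torch.zeros(1, 4, dtype=torch.long),
    cache_position=torch.arange(4),
)
assert prefill["position_ids"].tolist() == [[1, 2, 3, 4]]
assert "pixel_values" in prefill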

View File

@ -51,6 +51,10 @@ class VideoLlavaConfig(PretrainedConfig):
Can be either "full" to select all features or "default" to select features without `CLS`.
vision_feature_layer (`int`, *optional*, defaults to -2):
The index of the layer to select the vision feature.
image_seq_length (`int`, *optional*, defaults to 256):
Sequence length of one image embedding.
video_seq_length (`int`, *optional*, defaults to 2056):
Sequence length of one video embedding.
Example:
@ -86,6 +90,8 @@ class VideoLlavaConfig(PretrainedConfig):
projector_hidden_act="gelu",
vision_feature_select_strategy="default",
vision_feature_layer=-2,
image_seq_length=256,
video_seq_length=2056,
**kwargs,
):
self.ignore_index = ignore_index
@ -94,6 +100,8 @@ class VideoLlavaConfig(PretrainedConfig):
self.projector_hidden_act = projector_hidden_act
self.vision_feature_select_strategy = vision_feature_select_strategy
self.vision_feature_layer = vision_feature_layer
self.image_seq_length = image_seq_length
self.video_seq_length = video_seq_length
self.vision_config = vision_config
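For reference, a minimal instantiation showing the new defaults (256 tokens per image, 2056 per video):

from transformers import VideoLlavaConfig

config = VideoLlavaConfig()                                # picks up the documented defaults
print(config.image_seq_length, config.video_seq_length)    # 256 2056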

View File

@ -23,7 +23,6 @@ from torch import nn
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput
from ...utils import (
add_start_docstrings,
@ -228,6 +227,10 @@ VIDEO_LLAVA_INPUTS_DOCSTRING = r"""
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@ -413,6 +416,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, VideoLlavaCausalLMOutputWithPast]:
r"""
Args:
@ -503,51 +507,71 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
else self.config.vision_feature_select_strategy
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if (pixel_values_images is not None or pixel_values_videos is not None) and inputs_embeds is not None:
raise ValueError(
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
legacy_processing = False
if inputs_embeds is None:
# 1. Extract the input embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if (pixel_values_images is not None or pixel_values_videos is not None) and input_ids.shape[1] != 1:
image_outputs, video_outputs, num_frames = self._get_vision_features(
pixel_values_images=pixel_values_images,
pixel_values_videos=pixel_values_videos,
vision_feature_layer=vision_feature_layer,
vision_feature_select_strategy=vision_feature_select_strategy,
)
# if the number of image/video tokens is more than image embeddings seq length, then prob we expanded it in processing
# not very reliable, but we don't expect one to actually pass 500+ images for one prompt
img_token_count = (input_ids == self.config.image_token_index).sum(1).max()
video_token_count = (input_ids == self.config.video_token_index).sum(1).max()
inputs_expanded = (
img_token_count < self.config.image_seq_length and video_token_count < self.config.video_seq_length
)
pixels_present = (
input_ids.shape[-1] == 1 and pixel_values_images is not None and pixel_values_videos is not None
)
legacy_processing = inputs_expanded or pixels_present
# first add image embeds where possible, then expand again and add video embeds
if image_outputs is not None:
visual_features = self.multi_modal_projector(image_outputs)
(
inputs_embeds,
attention_mask,
labels,
position_ids,
input_ids,
) = self._merge_input_ids_with_visual_features(
visual_features, inputs_embeds, input_ids, attention_mask, labels
)
if video_outputs is not None:
visual_features = self.multi_modal_projector(video_outputs)
(
inputs_embeds,
attention_mask,
labels,
position_ids,
_,
) = self._merge_input_ids_with_visual_features(
visual_features,
inputs_embeds,
input_ids,
attention_mask,
labels,
num_frames=num_frames,
)
if pixel_values_images is not None or pixel_values_videos is not None:
image_outputs, video_outputs, num_frames = self._get_vision_features(
pixel_values_images=pixel_values_images,
pixel_values_videos=pixel_values_videos,
vision_feature_layer=vision_feature_layer,
vision_feature_select_strategy=vision_feature_select_strategy,
)
image_features = video_features = None
if image_outputs is not None:
image_features = self.multi_modal_projector(image_outputs)
if video_outputs is not None:
video_features = self.multi_modal_projector(video_outputs)
if legacy_processing:
logger.warning_once(
"Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
if input_ids.shape[1] != 1:
for features, frames in ((image_features, 1), (video_features, num_frames)):
if features is not None:
(
inputs_embeds,
attention_mask,
labels,
position_ids,
input_ids,
) = self._merge_input_ids_with_visual_features(
features,
inputs_embeds,
input_ids,
attention_mask,
labels,
num_frames=frames,
)
else:
# Retrieve the first layer to inspect the logits and mask out the hidden states
# that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
@ -577,6 +601,22 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
# TODO: @raushan retain only the new behavior after v4.47
else:
if image_outputs is not None:
special_image_mask = (
(input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
if video_outputs is not None:
special_image_mask = (
(input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, video_features)
outputs = self.language_model(
attention_mask=attention_mask,
position_ids=position_ids,
@ -586,6 +626,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
logits = outputs[0]
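A minimal, self-contained sketch of the masked_scatter merge used in the non-legacy branch above; the hidden size, toy sequence and placeholder id are illustrative assumptions, not values from this diff:
import torch
hidden_size = 4
image_token_index = 32000  # assumed id of the <image> placeholder
input_ids = torch.tensor([[1, 32000, 32000, 32000, 7, 8]])  # prompt already expanded by the processor
inputs_embeds = torch.randn(1, 6, hidden_size)  # text embeddings for the prompt
image_features = torch.randn(3, hidden_size)  # one projected visual feature per placeholder token
# build a boolean mask over every hidden dim of each placeholder position,
# then scatter the visual features into those positions
special_image_mask = (input_ids == image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
masked_scatter (as opposed to masked_scatter_) is out of place, so the original embedding tensor is left untouched.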
@ -626,60 +667,40 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel):
pixel_values_images=None,
pixel_values_videos=None,
attention_mask=None,
cache_position=None,
**kwargs,
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
else:
cache_length = past_length = past_key_values[0][0].shape[2]
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
else:
input_ids = input_ids[:, input_ids.shape[1] - 1 :]
# If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
# older attention values, as their corresponding values are not part of the input.
if cache_length < past_length and attention_mask is not None:
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
pixel_values_videos = None
pixel_values_images = None
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
"pixel_values_videos": pixel_values_videos,
"pixel_values_images": pixel_values_images,
}
# Trigger the new behavior only if the prompt already holds at least `image_seq_length` image tokens or `video_seq_length` video tokens, i.e. inputs were expanded in processing
legacy_processing = input_ids is not None and (
(input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
and (input_ids == self.config.video_token_index).sum(1).max() < self.config.video_seq_length
)
model_inputs = self.language_model.prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
cache_position=cache_position,
**kwargs,
)
if legacy_processing:
# legacy-specific code copied from the previous version; we assume there is always exactly one new token (assisted decoding doesn't work for VLMs)
# if cache_position[0] != 0:
# model_inputs["input_ids"] = model_inputs["input_ids"][:, -1:]
# if "position_ids" in model_inputs:
# model_inputs["position_ids"] = model_inputs["position_ids"][:, -1:]
model_inputs["pixel_values_images"] = pixel_values_images
model_inputs["pixel_values_videos"] = pixel_values_videos
elif cache_position[0] == 0:
# In the cached decoding stage, pixel values should be None because the input ids no longer contain the special image/video tokens
# Otherwise pixel values have to be passed to the model
model_inputs["pixel_values_images"] = pixel_values_images
model_inputs["pixel_values_videos"] = pixel_values_videos
return model_inputs
def _reorder_cache(self, *args, **kwargs):

View File

@ -19,10 +19,13 @@ Processor class for VideoLlava.
from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...image_utils import ImageInput, get_image_size, to_numpy_array
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...utils import TensorType, logging
logger = logging.get_logger(__name__)
class VideoLlavaProcessor(ProcessorMixin):
@ -37,16 +40,39 @@ class VideoLlavaProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
patch_size (`int`, *optional*):
Patch size from the vision tower.
vision_feature_select_strategy (`str`, *optional*):
The feature selection strategy used to select the vision feature from the vision backbone.
Should be the same as in the model's config.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
video_token (`str`, *optional*, defaults to `"<video>"`):
Special token used to denote video location.
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token", "video_token"]
image_processor_class = "VideoLlavaImageProcessor"
tokenizer_class = "AutoTokenizer"
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
def __init__(
self,
image_processor=None,
tokenizer=None,
patch_size=None,
vision_feature_select_strategy=None,
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
video_token="<video>",
chat_template=None,
**kwargs,
):
self.patch_size = patch_size
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = image_token
self.video_token = video_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
def __call__(
@ -114,8 +140,46 @@ class VideoLlavaProcessor(ProcessorMixin):
encoded_images = self.image_processor(images=images, videos=videos, return_tensors=return_tensors)
data.update(encoded_images)
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
prompt_strings = text
if encoded_images is not None and (self.patch_size is None or self.vision_feature_select_strategy is None):
logger.warning_once(
"Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
)
elif encoded_images is not None:
# Replace the image token with the expanded image token sequence
if "pixel_values" in encoded_images:
height, width = get_image_size(to_numpy_array(encoded_images.get("pixel_values")[0]))
num_frames = 1
else:
one_video = to_numpy_array(encoded_images.get("pixel_values_videos")[0])
height, width = get_image_size(one_video[0])
num_frames = one_video.shape[0] # frame dim is always after batch dim
num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
num_video_tokens = num_image_tokens * num_frames
if self.vision_feature_select_strategy == "default":
num_image_tokens -= 1
prompt_strings = []
for sample in text:
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
sample = sample.replace(self.video_token, self.video_token * num_video_tokens)
prompt_strings.append(sample)
text_inputs = self.tokenizer(
text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
prompt_strings,
return_tensors=return_tensors,
padding=padding,
truncation=truncation,
max_length=max_length,
)
data.update(text_inputs)
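As a back-of-the-envelope illustration of the expansion above, the same arithmetic on made-up numbers (a 224x224 processed frame and an 8-frame clip are assumptions, not values read from this diff):
patch_size = 14
vision_feature_select_strategy = "default"
height = width = 224  # assumed size of the processed frames
num_frames = 8        # assumed clip length
num_image_tokens = (height // patch_size) * (width // patch_size) + 1  # 257, incl. the CLS token
num_video_tokens = num_image_tokens * num_frames                       # 257 * 8 = 2056
if vision_feature_select_strategy == "default":
    num_image_tokens -= 1                                              # CLS is dropped for images
prompt = "USER: <image> and <video> ASSISTANT:"
prompt = prompt.replace("<image>", "<image>" * num_image_tokens)
prompt = prompt.replace("<video>", "<video>" * num_video_tokens)
After this, the tokenizer sees exactly one placeholder token per visual embedding, so the model no longer has to expand the sequence itself.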

View File

@ -47,6 +47,8 @@ class VipLlavaConfig(PretrainedConfig):
The layer norm epsilon of the projector layernorm
vision_feature_layers (`List[int]`, *optional*, defaults to `[-2, -5, -8, -11, 6]`):
The list of layers to select the vision features from.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.
Example:
@ -81,6 +83,7 @@ class VipLlavaConfig(PretrainedConfig):
projector_hidden_act="gelu",
projector_layernorm_eps=1e-5,
vision_feature_layers=[-2, -5, -8, -11, 6],
image_seq_length=576,
**kwargs,
):
self.ignore_index = ignore_index
@ -88,6 +91,7 @@ class VipLlavaConfig(PretrainedConfig):
self.projector_hidden_act = projector_hidden_act
self.projector_layernorm_eps = projector_layernorm_eps
self.vision_feature_layers = vision_feature_layers
self.image_seq_length = image_seq_length
self.vision_config = vision_config
if isinstance(self.vision_config, dict):

View File

@ -23,7 +23,6 @@ from torch import nn
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_outputs import ModelOutput
from ...utils import (
add_start_docstrings,
@ -231,6 +230,10 @@ VIPLLAVA_INPUTS_DOCSTRING = r"""
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
the complete sequence length.
"""
@ -375,6 +378,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, VipLlavaCausalLMOutputWithPast]:
r"""
Args:
@ -419,26 +423,48 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
)
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
)
if pixel_values is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
)
legacy_processing = False
if inputs_embeds is None:
# 1. Extract the input embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
# 2. Merge text and images
if pixel_values is not None and input_ids.shape[1] != 1:
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
# For VIP-llava, the image features are computed this way
# We select the features from index 1: for the layers -2, -5, -8, -11 and 6
image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
image_features = torch.cat(image_features, dim=-1)
# if the number of image tokens is at least the image embedding sequence length, the inputs were most likely already expanded in processing
# not very reliable, but we don't expect anyone to actually pass 500+ images for one prompt
# In case we're in the decoding stage, legacy behavior is detected by the presence of pixel values even if use_cache=True
legacy_processing = (
(input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
) or (input_ids.shape[-1] == 1 and pixel_values is not None)
image_features = self.multi_modal_projector(image_features)
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
image_features, inputs_embeds, input_ids, attention_mask, labels
if pixel_values is not None:
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
# For VIP-llava, the image features are computed this way
# We select the features from index 1: for the layers -2, -5, -8, -11 and 6
image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
image_features = torch.cat(image_features, dim=-1)
image_features = self.multi_modal_projector(image_features)
if legacy_processing:
logger.warning_once(
"Expanding inputs for image tokens in VipLLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
else:
# In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
# generation with cache
if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
# prefill stage vs decoding stage (legacy behavior copied)
if input_ids.shape[1] != 1:
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
image_features, inputs_embeds, input_ids, attention_mask, labels
)
else:
# Retrieve the first layer to inspect the logits and mask out the hidden states
# that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
@ -468,6 +494,14 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
# TODO: @raushan retain only the new behavior after v4.47
else:
special_image_mask = (
(input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
attention_mask=attention_mask,
position_ids=position_ids,
@ -477,6 +511,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
logits = outputs[0]
@ -510,56 +545,37 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
self,
input_ids,
past_key_values=None,
inputs_embeds=None,
pixel_values=None,
attention_mask=None,
cache_position=None,
**kwargs,
):
if past_key_values is not None:
if isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
else:
cache_length = past_length = past_key_values[0][0].shape[2]
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
elif self.config.image_token_index in input_ids:
input_ids = input_ids[:, input_ids.shape[1] - 1 :]
# If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
# older attention values, as their corresponding values are not part of the input.
if cache_length < past_length and attention_mask is not None:
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
"pixel_values": pixel_values,
}
# Trigger the new behavior only if the prompt already holds at least `image_seq_length` image tokens, i.e. inputs were expanded in processing
legacy_processing = (
input_ids is not None
and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
)
model_inputs = self.language_model.prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
cache_position=cache_position,
**kwargs,
)
if legacy_processing:
model_inputs["pixel_values"] = pixel_values
elif cache_position[0] == 0:
# In the cached decoding stage, pixel values should be None because the input ids no longer contain the special image token
# Otherwise pixel values have to be passed to the model
model_inputs["pixel_values"] = pixel_values
return model_inputs
def _reorder_cache(self, *args, **kwargs):
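For context, a hedged end-to-end sketch of opting into the new processor-side expansion so the deprecation warning above is not hit; the checkpoint id, prompt and image URL mirror the integration tests further down, while patch_size=14 and the dtype/device handling are assumptions:
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, VipLlavaForConditionalGeneration
model_id = "llava-hf/vip-llava-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
processor.patch_size = 14  # assumed patch size of the checkpoint's vision tower
processor.vision_feature_select_strategy = "default"
model = VipLlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
# the processor now repeats the <image> placeholder once per visual embedding,
# so the model takes the masked_scatter path instead of the legacy merge
inputs = processor(prompt, image, return_tensors="pt").to(model.device, torch.float16)
output = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(output, skip_special_tokens=True)[0])
Because the placeholders are already expanded, prepare_inputs_for_generation forwards pixel_values only at the prefill step (cache_position[0] == 0), exactly as in the branch above.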

View File

@ -1033,3 +1033,33 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
[0, 3, 7, 152, 67, 839, 1],
)
self.assertEqual(generated_text, "san diego")
def test_expansion_in_processing(self):
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
"Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
).to(torch_device)
image = prepare_img()
prompt = "Question: which city is this? Answer:"
# Make sure we take the legacy path by setting these args to None
processor.num_query_tokens = None
model.config.image_token_index = None
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
# Add args to the config to trigger the new logic, where inputs are expanded in the processing code
processor.num_query_tokens = model.config.num_query_tokens
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
model.config.image_token_index = len(processor.tokenizer) - 1
model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64)
# Generate again with new inputs
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
self.assertTrue(generated_text_expanded == generated_text)

View File

@ -637,3 +637,35 @@ class InstructBlipModelIntegrationTest(unittest.TestCase):
predictions[0].tolist(), [0, 37, 1023, 753, 3, 9, 2335, 3823, 30, 8, 2608, 28, 3, 9, 1782, 5, 1]
)
self.assertEqual(generated_text, "The image features a woman sitting on the beach with a dog.")
def test_expansion_in_processing(self):
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl")
model = InstructBlipForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-flan-t5-xl",
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
).to(torch_device)
image = prepare_img()
prompt = "What's in the image?"
# Make sure we take the legacy path by setting these args to None
processor.num_query_tokens = None
model.config.image_token_index = None
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
# Add args to the config to trigger the new logic, where inputs are expanded in the processing code
processor.num_query_tokens = model.config.num_query_tokens
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
model.config.image_token_index = len(processor.tokenizer) - 1
model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64)
# Generate again with new inputs
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
self.assertTrue(generated_text_expanded == generated_text)

View File

@ -119,7 +119,7 @@ class InstructBlipProcessorTest(unittest.TestCase):
tokenizer=tokenizer, image_processor=image_processor, qformer_tokenizer=qformer_tokenizer
)
input_str = "lower newer"
input_str = ["lower newer"]
encoded_processor = processor(text=input_str)

View File

@ -583,3 +583,33 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
generated_text,
"a baby girl wearing glasses is reading a book on the bed 1080p",
)
def test_expansion_in_processing(self):
processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
model = InstructBlipVideoForConditionalGeneration.from_pretrained(
"Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
)
clip = prepare_video()
prompt = "Explain what is happening in this short video."
# Make sure we take the legacy path by setting these args to None
processor.num_query_tokens = None
model.config.video_token_index = None
inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
# Add args to the config to trigger the new logic, where inputs are expanded in the processing code
processor.num_query_tokens = model.config.num_query_tokens
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<video>"]})
model.config.video_token_index = len(processor.tokenizer) - 1
model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64)
# Generate again with new inputs
inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
self.assertTrue(generated_text_expanded == generated_text)

View File

@ -186,6 +186,49 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
self.model_tester = LlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
wte = model.get_input_embeddings()
inputs["inputs_embeds"] = wte(input_ids)
with torch.no_grad():
model(**inputs)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
# while some other models require pixel_values to be present
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
@ -471,3 +514,33 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
processor = AutoProcessor.from_pretrained(model_id)
prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 18)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
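The expected lengths in this test follow from the expansion arithmetic; the 336x336 input resolution of the checkpoint's vision tower is an assumption here, the rest comes from the test itself:
patch_size = 14
image_size = 336                              # assumed CLIP ViT-L/14-336 resolution
text_len = 18                                 # tokenized prompt, incl. the single <image> placeholder
image_len = (image_size // patch_size) ** 2   # 576 with the "default" strategy (CLS dropped)
assert text_len - 1 + image_len == 593
The same 576 appears as the `image_seq_length` default added to `VipLlavaConfig` earlier in this diff.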

View File

@ -237,6 +237,49 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
wte = model.get_input_embeddings()
inputs["inputs_embeds"] = wte(input_ids)
with torch.no_grad():
model(**inputs)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
# while some other models require pixel_values to be present
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
@ -505,3 +548,33 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
with torch.no_grad():
output_train = model(**inputs_batched, output_hidden_states=True)
self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item())
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
processor = AutoProcessor.from_pretrained(model_id)
prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2356)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 17)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())

View File

@ -252,8 +252,8 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
def test_inputs_embeds(self):
# overwrite because llava can't support both inputs_embeds and pixel values at input
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
@ -274,6 +274,29 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
with torch.no_grad():
model(**inputs)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
# while some other models require pixel_values to be present
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
del inputs["pixel_values_videos"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
@ -487,3 +510,31 @@ class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):
with torch.no_grad():
output_train = model(**inputs_batched, output_hidden_states=True)
self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] == 0).all().item())
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
"llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True
)
processor = AutoProcessor.from_pretrained(model_id)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
inputs_expanded = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 1170)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
inputs = processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
self.assertTrue(inputs.input_ids.shape[-1] == 19)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())

View File

@ -53,9 +53,9 @@ class PaliGemmaVisionText2TextModelTester:
self,
parent,
ignore_index=-100,
image_token_index=98,
image_token_index=0,
projector_hidden_act="gelu",
seq_length=7,
seq_length=25,
vision_feature_select_strategy="default",
vision_feature_layer=-1,
projection_dim=32,
@ -87,8 +87,8 @@ class PaliGemmaVisionText2TextModelTester:
is_training=True,
vision_config={
"use_labels": True,
"image_size": 30,
"patch_size": 2,
"image_size": 20,
"patch_size": 5,
"num_image_tokens": 4,
"num_channels": 3,
"is_training": True,
@ -106,6 +106,7 @@ class PaliGemmaVisionText2TextModelTester:
):
self.parent = parent
self.ignore_index = ignore_index
# `image_token_index` is set to 0 to pass "resize_embeddings" test, do not modify
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
self.vision_feature_select_strategy = vision_feature_select_strategy
@ -157,8 +158,10 @@ class PaliGemmaVisionText2TextModelTester:
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
attention_mask = input_ids.ne(1).to(torch_device)
# setting the 4 first tokens to be image
input_ids[:, :4] = config.image_token_index
# set the first 16 tokens to be image tokens, and ensure that no other tokens are image tokens
# do not change this unless you modify the image size or patch size
input_ids = torch.where(input_ids == config.image_token_index, 2, input_ids)
input_ids[:, :16] = config.image_token_index
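# sanity check on the 16 above: the toy vision config uses image_size=20 and patch_size=5,
# so the tower yields (20 // 5) ** 2 = 16 patch embeddings, one placeholder token each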
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
@ -185,6 +188,49 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
self.model_tester = PaliGemmaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=PaliGemmaConfig, has_text_modality=False)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
wte = model.get_input_embeddings()
inputs["inputs_embeds"] = wte(input_ids)
with torch.no_grad():
model(**inputs)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
# while some other models require pixel_values to be present
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)

View File

@ -322,6 +322,51 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
for key in model_batched_output:
recursive_check(model_batched_output[key], model_row_output[key], model_name, key)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values_images"]
del inputs["pixel_values_videos"]
wte = model.get_input_embeddings()
inputs["inputs_embeds"] = wte(input_ids)
with torch.no_grad():
model(**inputs)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
# while some other models require pixel_values to be present
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values_images"]
del inputs["pixel_values_videos"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
@require_torch
class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
@ -545,3 +590,35 @@ class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
labels=input_ids,
).loss
loss.backward()
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):
model_id = "LanguageBind/Video-LLaVA-7B-hf"
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
processor = VideoLlavaProcessor.from_pretrained(model_id)
prompt = "USER: <video>Describe the video in details. ASSISTANT:"
video_file = hf_hub_download(
repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
)
video_file = np.load(video_file)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
inputs_expanded = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2073)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
inputs = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 18)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
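The 2073 above can be reproduced with the processor arithmetic; the 224x224 frame size and the 8-frame demo clip are assumptions about this checkpoint, not values stated in the diff:
patch_size = 14
frame_size = 224   # assumed Video-LLaVA input resolution
num_frames = 8     # assumed number of sampled frames in the demo clip
text_len = 18      # tokenized prompt, incl. the single <video> placeholder
tokens_per_frame = (frame_size // patch_size) ** 2 + 1  # 257, CLS kept for video frames
video_len = tokens_per_frame * num_frames               # 2056
assert text_len - 1 + video_len == 2073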

View File

@ -167,6 +167,49 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestC
self.model_tester = VipLlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
wte = model.get_input_embeddings()
inputs["inputs_embeds"] = wte(input_ids)
with torch.no_grad():
model(**inputs)
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
# while some other models require pixel_values to be present
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))
@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
@ -260,3 +303,33 @@ class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
labels=input_ids,
).loss
loss.backward()
@slow
@require_bitsandbytes
def test_expansion_in_processing(self):
model_id = "llava-hf/vip-llava-7b-hf"
model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
processor = AutoProcessor.from_pretrained(model_id)
prompt = "USER: <image>\nDescribe the image:\nASSISTANT:"
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 18)
# generate exactly 20 tokens
output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20)
output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20)
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())