From 8198d49871b617afe7074b91e3314bd57b23989b Mon Sep 17 00:00:00 2001 From: yaswanth Date: Fri, 7 Mar 2025 13:59:06 +0530 Subject: [PATCH 01/62] Model skelton --- docs/source/en/_toctree.yml | 69 + docs/source/en/model_doc/aimv2.md | 82 ++ src/transformers/__init__.py | 32 + src/transformers/models/__init__.py | 1 + src/transformers/models/aimv2/__init__.py | 29 + .../models/aimv2/configuration_aimv2.py | 422 ++++++ .../convert_aimv2_original_pytorch_to_hf.py | 156 +++ .../models/aimv2/modeling_aimv2.py | 0 .../models/auto/configuration_auto.py | 2 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 7 + tests/models/aimv2/__init__.py | 0 tests/models/aimv2/test_modeling_aimv2.py | 1140 +++++++++++++++++ 15 files changed, 1945 insertions(+) create mode 100644 docs/source/en/model_doc/aimv2.md create mode 100644 src/transformers/models/aimv2/__init__.py create mode 100644 src/transformers/models/aimv2/configuration_aimv2.py create mode 100644 src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py create mode 100644 src/transformers/models/aimv2/modeling_aimv2.py create mode 100644 tests/models/aimv2/__init__.py create mode 100644 tests/models/aimv2/test_modeling_aimv2.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 00d898e4d18..7dfa4d33be4 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1,3 +1,4 @@ +- sections: - sections: - local: index title: Transformers @@ -6,8 +7,11 @@ - local: quicktour title: Quickstart title: Get started +- isExpanded: false + title: Get started - isExpanded: false sections: + - sections: - sections: - local: models title: Loading models @@ -30,6 +34,8 @@ - local: attention title: Attention mechanisms title: Models + - sections: + title: Models - sections: - local: fast_tokenizers title: Tokenizers @@ -47,8 +53,12 @@ title: Padding and truncation title: Preprocessors title: Base classes +- isExpanded: false + title: Preprocessors + title: Base classes - isExpanded: false sections: + - sections: - sections: - local: pipeline_tutorial title: Pipeline @@ -59,6 +69,8 @@ - local: add_new_pipeline title: Adding a new pipeline title: Pipeline API + - sections: + title: Pipeline API - sections: - local: llm_tutorial title: Text generation @@ -81,6 +93,8 @@ - local: perplexity title: Perplexity of fixed-length models title: LLMs + - sections: + title: LLMs - sections: - local: conversations title: Chat basics @@ -93,6 +107,8 @@ - local: chat_extras title: Tools and RAG title: Chat with models + - sections: + title: Chat with models - sections: - local: perf_torch_compile title: torch.compile @@ -105,13 +121,17 @@ - local: tf_xla title: XLA title: Optimization + title: Optimization - local: agents title: Agents - local: tools title: Tools title: Inference +- isExpanded: false + title: Inference - isExpanded: false sections: + - sections: - sections: - local: trainer title: Trainer @@ -122,6 +142,8 @@ - local: hpo_train title: Hyperparameter search title: Trainer API + - sections: + title: Trainer API - sections: - local: gpu_selection title: GPU selection @@ -138,6 +160,8 @@ - local: perf_train_gpu_many title: Parallelism methods title: Distributed training + - sections: + title: Distributed training - sections: - local: perf_train_gpu_one title: GPU @@ -150,11 +174,14 @@ - local: perf_hardware title: Build your own machine title: Hardware + title: Hardware - local: peft 
title: PEFT - local: model_memory_anatomy title: Model training anatomy title: Training +- isExpanded: false + title: Training - isExpanded: false sections: - local: quantization/overview @@ -198,6 +225,8 @@ - local: quantization/contribute title: Contribute title: Quantization +- isExpanded: false + title: Quantization - isExpanded: false sections: - local: serialization @@ -209,8 +238,12 @@ - local: torchscript title: TorchScript title: Export to production +- isExpanded: false + title: Export to production - isExpanded: false sections: + - sections: + - sections: - sections: - sections: - local: tasks/sequence_classification @@ -230,12 +263,16 @@ - local: tasks/multiple_choice title: Multiple choice title: Natural language processing + - sections: + title: Natural language processing - sections: - local: tasks/audio_classification title: Audio classification - local: tasks/asr title: Automatic speech recognition title: Audio + - sections: + title: Audio - sections: - local: tasks/image_classification title: Image classification @@ -262,6 +299,8 @@ - local: tasks/knowledge_distillation_for_image_classification title: Knowledge Distillation for Computer Vision title: Computer vision + - sections: + title: Computer vision - sections: - local: tasks/image_captioning title: Image captioning @@ -279,6 +318,8 @@ title: Video-text-to-text title: Multimodal title: Task recipes + title: Multimodal + title: Task recipes - local: run_scripts title: Training scripts - local: glossary @@ -292,6 +333,8 @@ - local: troubleshooting title: Troubleshoot title: Resources +- isExpanded: false + title: Resources - isExpanded: false sections: - local: contributing @@ -301,8 +344,11 @@ - local: pr_checks title: Pull request checks title: Contribute +- isExpanded: false + title: Contribute - isExpanded: false sections: + - sections: - sections: - local: main_classes/agent title: Agents and Tools @@ -351,6 +397,9 @@ - local: main_classes/image_processor title: Image Processor title: Main classes + - sections: + - sections: + title: Main classes - sections: - sections: - local: model_doc/albert @@ -664,6 +713,8 @@ - local: model_doc/zamba2 title: Zamba2 title: Text models + - sections: + title: Text models - sections: - local: model_doc/beit title: BEiT @@ -794,6 +845,8 @@ - local: model_doc/zoedepth title: ZoeDepth title: Vision models + - sections: + title: Vision models - sections: - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer @@ -864,6 +917,8 @@ - local: model_doc/xlsr_wav2vec2 title: XLSR-Wav2Vec2 title: Audio models + - sections: + title: Audio models - sections: - local: model_doc/timesformer title: TimeSformer @@ -873,6 +928,8 @@ title: ViViT title: Video models - sections: + - local: model_doc/aimv2 + title: AIMv2 - local: model_doc/align title: ALIGN - local: model_doc/altclip @@ -1020,12 +1077,16 @@ - local: model_doc/xclip title: X-CLIP title: Multimodal models + - sections: + title: Multimodal models - sections: - local: model_doc/decision_transformer title: Decision Transformer - local: model_doc/trajectory_transformer title: Trajectory Transformer title: Reinforcement learning models + - sections: + title: Reinforcement learning models - sections: - local: model_doc/autoformer title: Autoformer @@ -1038,11 +1099,16 @@ - local: model_doc/time_series_transformer title: Time Series Transformer title: Time series models + - sections: + title: Time series models - sections: - local: model_doc/graphormer title: Graphormer title: Graph models title: Models + - 
sections: + title: Graph models + title: Models - sections: - local: internal/modeling_utils title: Custom Layers and Utilities @@ -1066,3 +1132,6 @@ title: Utilities for Time Series title: Internal helpers title: API + + title: Internal helpers + title: API diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md new file mode 100644 index 00000000000..1c33a06c927 --- /dev/null +++ b/docs/source/en/model_doc/aimv2.md @@ -0,0 +1,82 @@ + + +# AIMv2 + +## Overview + +The AIMv2 model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + + +## AIMv2Config + +[[autodoc]] AIMv2Config + - from_text_vision_configs + +## AIMv2TextConfig + +[[autodoc]] AIMv2TextConfig + +## AIMv2VisionConfig + +[[autodoc]] AIMv2VisionConfig + +## AIMv2Model + +[[autodoc]] AIMv2Model + - forward + - get_text_features + - get_image_features + +## AIMv2TextModel + +[[autodoc]] AIMv2TextModel + - forward + +## AIMv2TextModelWithProjection + +[[autodoc]] AIMv2TextModelWithProjection + - forward + +## AIMv2VisionModelWithProjection + +[[autodoc]] AIMv2VisionModelWithProjection + - forward + +## AIMv2VisionModel + +[[autodoc]] AIMv2VisionModel + - forward + +## AIMv2ForImageClassification + +[[autodoc]] AIMv2ForImageClassification + - forward + + + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e5caa17d25c..321580896d7 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -289,6 +289,11 @@ _import_structure = { "CLIPTokenizer", "CLIPVisionConfig", ], + "models.aimv2": [ + "AIMv2Config", + "AIMv2TextConfig", + "AIMv2VisionConfig", + ], "models.clipseg": [ "CLIPSegConfig", "CLIPSegProcessor", @@ -1852,6 +1857,17 @@ else: "CLIPVisionModelWithProjection", ] ) + _import_structure["models.aimv2"].extend( + [ + "AIMv2ForImageClassification", + "AIMv2Model", + "AIMv2PreTrainedModel", + "AIMv2TextModel", + "AIMv2TextModelWithProjection", + "AIMv2VisionModel", + "AIMv2VisionModelWithProjection", + ] + ) _import_structure["models.clipseg"].extend( [ "CLIPSegForImageSegmentation", @@ -5459,6 +5475,13 @@ if TYPE_CHECKING: CLIPTokenizer, CLIPVisionConfig, ) + from .models.aimv2 import ( + AIMv2Config, + + AIMv2TextConfig, + + AIMv2VisionConfig, + ) from .models.clipseg import ( CLIPSegConfig, CLIPSegProcessor, @@ -7010,6 +7033,15 @@ if TYPE_CHECKING: CLIPVisionModel, CLIPVisionModelWithProjection, ) + from .models.aimv2 import ( + AIMv2ForImageClassification, + AIMv2Model, + AIMv2PreTrainedModel, + AIMv2TextModel, + AIMv2TextModelWithProjection, + AIMv2VisionModel, + AIMv2VisionModelWithProjection, + ) from .models.clipseg import ( CLIPSegForImageSegmentation, CLIPSegModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 06575ffceba..0158eb42903 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -49,6 +49,7 @@ from . import ( chinese_clip, clap, clip, + aimv2, clipseg, clvp, code_llama, diff --git a/src/transformers/models/aimv2/__init__.py b/src/transformers/models/aimv2/__init__.py new file mode 100644 index 00000000000..de0f553d50c --- /dev/null +++ b/src/transformers/models/aimv2/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_aimv2 import * + from .modeling_aimv2 import * + from .modeling_flax_aimv2 import * + from .modeling_tf_aimv2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py new file mode 100644 index 00000000000..cbf2ab48842 --- /dev/null +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -0,0 +1,422 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AIMv2 model configuration""" + +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class AIMv2TextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a AIMv2 + text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 + [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`AIMv2Model`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 49406): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 49407): + End of stream token id. + + Example: + + ```python + >>> from transformers import AIMv2TextConfig, AIMv2TextModel + + >>> # Initializing a AIMv2TextConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = AIMv2TextConfig() + + >>> # Initializing a AIMv2TextModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = AIMv2TextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "aimv2_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + # This differs from `CLIPTokenizer`'s default and from openai/aimv2 + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id=1, + bos_token_id=49406, + eos_token_id=49407, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + +class AIMv2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. 
It is used to instantiate a + AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 + [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). 
+ + Example: + + ```python + >>> from transformers import AIMv2VisionConfig, AIMv2VisionModel + + >>> # Initializing a AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = AIMv2VisionConfig() + + >>> # Initializing a AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = AIMv2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "aimv2_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class AIMv2Config(PretrainedConfig): + r""" + [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to instantiate + a AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating + a configuration with the defaults will yield a similar configuration to that of the AIMv2 + [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AIMv2TextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AIMv2VisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original AIMv2 implementation. + kwargs (*optional*): + Dictionary of keyword arguments. 
+ + Example: + + ```python + >>> from transformers import AIMv2Config, AIMv2Model + + >>> # Initializing a AIMv2Config with apple/aimv2-large-patch14-224 style configuration + >>> configuration = AIMv2Config() + + >>> # Initializing a AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = AIMv2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a AIMv2Config from a AIMv2TextConfig and a AIMv2VisionConfig + >>> from transformers import AIMv2TextConfig, AIMv2VisionConfig + + >>> # Initializing a AIMv2Text and AIMv2Vision configuration + >>> config_text = AIMv2TextConfig() + >>> config_vision = AIMv2VisionConfig() + + >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "aimv2" + sub_configs = {"text_config": AIMv2TextConfig, "vision_config": AIMv2VisionConfig} + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + # If `_config_dict` exist, we use them for the backward compatibility. + # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot + # of confusion!). + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + + super().__init__(**kwargs) + + # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in + # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most + # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. + if text_config_dict is not None: + if text_config is None: + text_config = {} + + # This is the complete result when using `text_config_dict`. + _text_config_dict = AIMv2TextConfig(**text_config_dict).to_dict() + + # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. + for key, value in _text_config_dict.items(): + if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + # If specified in `text_config_dict` + if key in text_config_dict: + message = ( + f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " + f'The value `text_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`text_config_dict` is provided which will be used to initialize `AIMv2TextConfig`. The " + f'value `text_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `text_config` with the ones in `_text_config_dict`. + text_config.update(_text_config_dict) + + if vision_config_dict is not None: + if vision_config is None: + vision_config = {} + + # This is the complete result when using `vision_config_dict`. + _vision_config_dict = AIMv2VisionConfig(**vision_config_dict).to_dict() + # convert keys to string instead of integer + if "id2label" in _vision_config_dict: + _vision_config_dict["id2label"] = { + str(key): value for key, value in _vision_config_dict["id2label"].items() + } + + # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. 
+ for key, value in _vision_config_dict.items(): + if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + # If specified in `vision_config_dict` + if key in vision_config_dict: + message = ( + f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " + f'values. The value `vision_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`vision_config_dict` is provided which will be used to initialize `AIMv2VisionConfig`. " + f'The value `vision_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `vision_config` with the ones in `_vision_config_dict`. + vision_config.update(_vision_config_dict) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `AIMv2TextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `AIMv2VisionConfig` with default values.") + + self.text_config = AIMv2TextConfig(**text_config) + self.vision_config = AIMv2VisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs): + r""" + Instantiate a [`AIMv2Config`] (or a derived class) from aimv2 text model configuration and aimv2 vision model + configuration. + + Returns: + [`AIMv2Config`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +class AIMv2OnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.image_processor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 + + +__all__ = ["AIMv2Config", "AIMv2OnnxConfig", "AIMv2TextConfig", "AIMv2VisionConfig"] diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py new file mode 100644 index 00000000000..db2a7285871 --- /dev/null +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -0,0 +1,156 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch +from aimv2 import load + +from transformers import AIMv2Config, AIMv2Model + + +def copy_attn_layer(hf_attn_layer, pt_attn_layer): + q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) + q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) + + out_proj_weights = pt_attn_layer.out_proj.weight + out_proj_bias = pt_attn_layer.out_proj.bias + + hf_attn_layer.q_proj.weight.data = q_proj + hf_attn_layer.q_proj.bias.data = q_proj_bias + + hf_attn_layer.k_proj.weight.data = k_proj + hf_attn_layer.k_proj.bias.data = k_proj_bias + + hf_attn_layer.v_proj.weight.data = v_proj + hf_attn_layer.v_proj.bias.data = v_proj_bias + + hf_attn_layer.out_proj.weight = out_proj_weights + hf_attn_layer.out_proj.bias = out_proj_bias + + +def copy_mlp(hf_mlp, pt_mlp): + copy_linear(hf_mlp.fc1, pt_mlp.c_fc) + copy_linear(hf_mlp.fc2, pt_mlp.c_proj) + + +def copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + +def copy_layer(hf_layer, pt_layer): + # copy layer norms + copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) + copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) + + # copy MLP + copy_mlp(hf_layer.mlp, pt_layer.mlp) + + # copy attn + copy_attn_layer(hf_layer.self_attn, pt_layer.attn) + + +def copy_layers(hf_layers, pt_layers): + for hf_layer, pt_layer in zip(hf_layers, pt_layers): + copy_layer(hf_layer, pt_layer) + + +def copy_encoder(hf_encoder, pt_model): + # copy embeds + hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight + hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding + + # copy layer norm + copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) + + # copy hidden layers + copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) + + +def copy_text_model_and_projection(hf_model, pt_model): + # copy projection + hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous() + + # copy text encoder + copy_encoder(hf_model.text_model, pt_model) + + +def copy_vison_model_and_projection(hf_model, pt_model): + # copy projection + hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous() + + # copy layer norms + copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) + copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) + + # copy embeds + hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data + hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding + hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data + + # copy encoder + copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) + + +@torch.no_grad() +def convert_aimv2_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): + """ + Copy/paste/tweak 
model's weights to transformers design. + """ + if config_path is not None: + config = AIMv2Config.from_pretrained(config_path) + else: + config = AIMv2Config(projection_dim=512, text_config={}, vision_config={}) + + hf_model = AIMv2Model(config).eval() + + pt_model, _ = load(checkpoint_path, device="cpu", jit=False) + pt_model = pt_model.eval() + + copy_text_model_and_projection(hf_model, pt_model) + copy_vison_model_and_projection(hf_model, pt_model) + hf_model.logit_scale = pt_model.logit_scale + + # Use `eos_token` so the example is more meaningful + input_ids = torch.tensor( + [ + [config.text_config.bos_token_id] + + list(range(3, 77)) + + [config.text_config.eos_token_id] + + [config.text_config.pad_token_id] + ] + ) + pixel_values = torch.randn(1, 3, 224, 224) + + hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True) + hf_logits_per_image = hf_outputs.logits_per_image + hf_logits_per_text = hf_outputs.logits_per_text + pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) + + assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) + assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) + + hf_model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + args = parser.parse_args() + + convert_aimv2_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 712450e166a..36c84eac8d1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -64,6 +64,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("chinese_clip_vision_model", "ChineseCLIPVisionConfig"), ("clap", "ClapConfig"), ("clip", "CLIPConfig"), + ("aimv2", "AIMv2Config"), ("clip_text_model", "CLIPTextConfig"), ("clip_vision_model", "CLIPVisionConfig"), ("clipseg", "CLIPSegConfig"), @@ -396,6 +397,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), ("clap", "CLAP"), ("clip", "CLIP"), + ("aimv2", "AIMv2"), ("clip_text_model", "CLIPTextModel"), ("clip_vision_model", "CLIPVisionModel"), ("clipseg", "CLIPSeg"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 134571014f9..9474de57501 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -44,6 +44,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( ("chinese_clip", "ChineseCLIPFeatureExtractor"), ("clap", "ClapFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), + ("aimv2", "AIMv2FeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), ("clvp", "ClvpFeatureExtractor"), ("conditional_detr", "ConditionalDetrFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 
7bf2180b2d8..77cdc245ee4 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -64,6 +64,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), ("clap", "ClapModel"), ("clip", "CLIPModel"), + ("aimv2", "AIMv2Model"), ("clip_text_model", "CLIPTextModel"), ("clip_vision_model", "CLIPVisionModel"), ("clipseg", "CLIPSegModel"), @@ -681,6 +682,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("beit", "BeitForImageClassification"), ("bit", "BitForImageClassification"), ("clip", "CLIPForImageClassification"), + ("aimv2", "AIMv2ForImageClassification"), ("convnext", "ConvNextForImageClassification"), ("convnextv2", "ConvNextV2ForImageClassification"), ("cvt", "CvtForImageClassification"), @@ -1416,6 +1418,7 @@ MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("blip-2", "Blip2ForImageTextRetrieval"), ("chinese_clip", "ChineseCLIPModel"), ("clip", "CLIPModel"), + ("aimv2", "AIMv2Model"), ("clipseg", "CLIPSegModel"), ("siglip", "SiglipModel"), ("siglip2", "Siglip2Model"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 5b699a4a44d..816993a7a5a 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -57,6 +57,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("chinese_clip", "ChineseCLIPProcessor"), ("clap", "ClapProcessor"), ("clip", "CLIPProcessor"), + ("aimv2", "AIMv2Processor"), ("clipseg", "CLIPSegProcessor"), ("clvp", "ClvpProcessor"), ("colpali", "ColPaliProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 13f9a8a4297..f7112b2f18b 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -131,6 +131,13 @@ else: "CLIPTokenizerFast" if is_tokenizers_available() else None, ), ), + ( + "aimv2", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), ( "clipseg", ( diff --git a/tests/models/aimv2/__init__.py b/tests/models/aimv2/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py new file mode 100644 index 00000000000..296c1811729 --- /dev/null +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -0,0 +1,1140 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
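+
+# A sketch of a typical local invocation for this new suite (the file path comes from this patch; slow
+# checkpoint-download tests additionally need RUN_SLOW=1 in the environment):
+#   python -m pytest tests/models/aimv2/test_modeling_aimv2.py -k "AIMv2"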
+"""Testing suite for the PyTorch AIMv2 model.""" + +import inspect +import os +import tempfile +import unittest +from typing import Optional, Tuple + +import numpy as np +import requests +from parameterized import parameterized +from pytest import mark + +from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig +from transformers.testing_utils import ( + require_flash_attn, + require_torch, + require_torch_gpu, + require_torch_sdpa, + require_vision, + slow, + torch_device, +) +from transformers.utils import ( + is_torch_available, + is_torch_bf16_available_on_device, + is_torch_fp16_available_on_device, + is_torch_sdpa_available, + is_vision_available, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + is_flaky, + random_attention_mask, +) +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import ( + AIMv2ForImageClassification, + AIMv2Model, + AIMv2TextModel, + AIMv2TextModelWithProjection, + AIMv2VisionModel, + AIMv2VisionModelWithProjection, + ) + + +if is_torch_sdpa_available(): + from torch.nn.attention import SDPBackend, sdpa_kernel + + +if is_vision_available(): + from PIL import Image + + from transformers import CLIPProcessor + + +class AIMv2VisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return AIMv2VisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = AIMv2VisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * 
(image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def create_and_check_model_with_projection(self, config, pixel_values):
+        model = AIMv2VisionModelWithProjection(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
+        image_size = (self.image_size, self.image_size)
+        patch_size = (self.patch_size, self.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+class AIMv2ModelTesterMixin(ModelTesterMixin):
+    """
+    Subclass of ModelTesterMixin with methods specific to testing AIMv2 models.
+    The SDPA equivalence test is overridden here because AIMv2 models may have text/vision/text+vision inputs,
+    different output logits, and are not supposed to be used or tested with padding_side="left".
+    """
+
+    def test_sdpa_can_dispatch_composite_models(self):
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                # Load the model with SDPA
+                model_sdpa = model_class.from_pretrained(tmpdirname)
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                # Load model with eager attention
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    attn_implementation="eager",
+                )
+                model_eager = model_eager.eval().to(torch_device)
+
+            # AIMv2 has one shared cls attr for all models, so we assign both submodels here
+            vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager"
+
+            # `None` as it is the requested one which will be assigned to each sub-config
+            # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
+            if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"):
+                self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
+                self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn)
+                self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
+                self.assertTrue(model_eager.text_model.config._attn_implementation == "eager")
+
+            self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+            self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+            for name, submodule in model_eager.named_modules():
+                class_name = submodule.__class__.__name__
+                if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                    raise ValueError("The eager model should not have SDPA attention layers")
+
+            has_sdpa = False
+            for name, submodule in model_sdpa.named_modules():
+                class_name = submodule.__class__.__name__
+                if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                    has_sdpa = True
+                    break
+            if not has_sdpa and model_sdpa.config.model_type != "falcon":
+                raise
ValueError("The SDPA model should have SDPA attention layers") + + def test_eager_matches_sdpa_inference( + self, + torch_dtype: str, + use_attention_mask_options: Tuple[Optional[str], ...] = (None, "left", "right"), + logit_keys: Tuple[str, ...] = ("logits_per_image", "logits_per_text", "image_embeds", "text_embeds"), + ): + if not self.all_model_classes[0]._supports_sdpa: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): + self.skipTest( + f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" + ) + + # Convert to torch dtype + dtypes = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + torch_dtype = dtypes[torch_dtype] + + atols = { + torch.float32: 1e-5, + torch.bfloat16: 3e-2, + torch.float16: 5e-3, + } + rtols = { + torch.float32: 1e-4, + torch.bfloat16: 3e-2, + torch.float16: 5e-3, + } + + atol = atols[torch_dtype] + rtol = rtols[torch_dtype] + + def get_mean_reldiff(msg, current_case, x, ref, atol, rtol): + return f"{msg} {current_case}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # Load the model with SDPA + model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + model_sdpa = model_sdpa.eval().to(torch_device) + + # Load model with eager attention + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch_dtype, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time, + # but it would be nicer to have an efficient way to use parameterized.expand + cases = [ + (use_mask, output_attentions, sdpa_backend, batch_size) + for use_mask in use_attention_mask_options + for output_attentions in [True, False] + for sdpa_backend in [ + [SDPBackend.MATH], + [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH], + [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH], + [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH], + ] + for batch_size in [1, 5] + ] + fail_cases = [] + + for use_mask, output_attentions, sdpa_backend, batch_size in cases: + processed_inputs = inputs_dict.copy() + + # convert to torch_dtype + if "pixel_values" in processed_inputs: + processed_inputs["pixel_values"] = processed_inputs["pixel_values"].to(torch_dtype) + + # slice for different batch sizes + for key in ["pixel_values", "input_ids", "attention_mask"]: + if key in processed_inputs: + processed_inputs[key] = processed_inputs[key][:batch_size] + + # set attention mask with left padding + if not use_mask: + processed_inputs.pop("attention_mask", None) + elif use_mask == "left": + dummy_attention_mask = processed_inputs["attention_mask"] + dummy_attention_mask[:] = 1 + dummy_attention_mask[:, :1] = 0 + processed_inputs["attention_mask"] = dummy_attention_mask + elif use_mask == 
"right": + dummy_attention_mask = processed_inputs["attention_mask"] + dummy_attention_mask[:] = 1 + dummy_attention_mask[:, -1:] = 0 + processed_inputs["attention_mask"] = dummy_attention_mask + else: + raise ValueError(f"Invalid value for use_mask={use_mask}") + + processed_inputs["output_attentions"] = output_attentions + processed_inputs["output_hidden_states"] = True + + current_case = f"use_mask={use_mask}, batch_size={batch_size}, sdpa_backend={sdpa_backend}" + + prepared_inputs = self._prepare_for_class(processed_inputs, model_class) + + with torch.no_grad(): + try: + with sdpa_kernel(sdpa_backend): + outputs_eager = model_eager(**prepared_inputs) + outputs_sdpa = model_sdpa(**prepared_inputs) + except Exception as e: + fail_cases.append(f"{current_case}: {e}") + continue + + keys = set(logit_keys) & set(outputs_eager.keys()) + self.assertTrue( + keys, f"Keys {logit_keys} not found in outputs. Available keys: {outputs_eager.keys()}" + ) + + for key in keys: + try: + eager_logits = outputs_eager[key] + sdpa_logits = outputs_sdpa[key] + except KeyError: + raise KeyError(f"Key {key} not found in outputs. Available keys: {outputs_eager.keys()}") + + if "hidden_state" in key and use_mask == "left": + eager_logits = eager_logits[:, 1:] + sdpa_logits = sdpa_logits[:, 1:] + elif "hidden_state" in key and use_mask == "right": + eager_logits = eager_logits[:, :-1] + sdpa_logits = sdpa_logits[:, :-1] + + is_close = torch.allclose(eager_logits, sdpa_logits, atol=atol, rtol=rtol) + if not is_close: + fail_cases.append(get_mean_reldiff(key, current_case, sdpa_logits, eager_logits, atol, rtol)) + + self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) + + +@require_torch +class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as AIMv2 does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (AIMv2VisionModel, AIMv2VisionModelWithProjection) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = AIMv2VisionModelTester(self) + self.config_tester = ConfigTester(self, config_class=AIMv2VisionConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="AIMv2 does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="AIMv2VisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="AIMv2VisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "apple/aimv2-large-patch14-224" + model = AIMv2VisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + model_name = "apple/aimv2-large-patch14-224" + model = AIMv2VisionModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "visual_projection")) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("last_hidden_state", "pooler_output", "image_embeds"), + use_attention_mask_options=(None,), + ) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + 
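+        # The shared implementation in AIMv2ModelTesterMixin saves the model, reloads it once with SDPA and
+        # once with eager attention, and checks the `_attn_implementation` reported by the config and
+        # sub-configs; this override only adds the `require_torch_sdpa` gating for the vision tests.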
super().test_sdpa_can_dispatch_composite_models() + + +class AIMv2TextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return AIMv2TextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = AIMv2TextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, input_ids, input_mask): + model = AIMv2TextModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): + all_model_classes = (AIMv2TextModel, AIMv2TextModelWithProjection) if is_torch_available() else () + 
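+    # The flags below switch off common tests that are not exercised for this text tower (fx tracing, pruning,
+    # head masking); `model_split_percents` feeds the offload/parallelism checks inherited from ModelTesterMixin.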
fx_compatible = False + test_pruning = False + test_head_masking = False + model_split_percents = [0.5, 0.8, 0.9] + + def setUp(self): + self.model_tester = AIMv2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=AIMv2TextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_with_projection(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="AIMv2 does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="AIMv2TextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="AIMv2TextModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "apple/aimv2-large-patch14-224" + model = AIMv2TextModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @slow + def test_model_with_projection_from_pretrained(self): + model_name = "apple/aimv2-large-patch14-224" + model = AIMv2TextModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "text_projection")) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("last_hidden_state", "pooler_output", "text_embeds"), + use_attention_mask_options=(None, "right"), # "left" is not supported for text model + ) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + @require_torch_sdpa + def test_sdpa_can_dispatch_on_flash(self): + self.skipTest(reason="AIMv2TextModel has two attention masks: `causal_attention_mask` and `attention_mask`") + + +class AIMv2ModelTester: + def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + if text_kwargs is None: + text_kwargs = {} + if vision_kwargs is None: + vision_kwargs = {} + + self.parent = parent + self.text_model_tester = AIMv2TextModelTester(parent, **text_kwargs) + self.vision_model_tester = AIMv2VisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = 
self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return AIMv2Config.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = AIMv2Model(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (AIMv2Model,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + _is_composite = True + + def setUp(self): + self.model_tester = AIMv2ModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=AIMv2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="AIMv2Model does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass + + # override as the `logit_scale` parameter initilization is different for AIMv2 + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initilized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = 
_config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # AIMv2 needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save AIMv2Config and check if we can load AIMv2VisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = AIMv2VisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save AIMv2Config and check if we can load AIMv2TextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = AIMv2TextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + @slow + def test_model_from_pretrained(self): + model_name = "apple/aimv2-large-patch14-224" + model = AIMv2Model.from_pretrained(model_name) + self.assertIsNotNone(model) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("logits_per_image", "logits_per_text"), + use_attention_mask_options=(None, "right"), # "left" is not supported for text model + ) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + @require_torch_sdpa + def test_sdpa_can_dispatch_on_flash(self): + self.skipTest(reason="AIMv2 text tower has two attention 
masks: `causal_attention_mask` and `attention_mask`") + + @require_torch_sdpa + def test_sdpa_can_compile_dynamic(self): + self.skipTest(reason="AIMv2 model can't be compiled dynamic, error in aimv2_loss`") + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) + dummy_input_ids = inputs_dict["input_ids"] + + outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True + ) + + self.assertTrue( + torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2), + f"Image logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}", + ) + self.assertTrue( + torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2), + f"Text logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}", + ) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_inference_equivalence_right_padding(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + model.to(torch_device) + + dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) + dummy_input_ids = inputs_dict["input_ids"] + dummy_pixel_mask = inputs_dict["attention_mask"] + + # right padding + dummy_pixel_mask[:] = 1 + dummy_pixel_mask[:, -1:] = 0 + + outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True + ) + + logits_per_image_eager = outputs.logits_per_image[:, :-1] + logits_per_text_eager = outputs.logits_per_text[:, :-1] + + logits_per_image_sdpa = outputs_fa.logits_per_image[:, :-1] + logits_per_text_sdpa = outputs_fa.logits_per_text[:, :-1] + + self.assertTrue( + torch.allclose(logits_per_image_eager, logits_per_image_sdpa, atol=4e-2, rtol=4e-2), + f"Image logits max diff: {torch.max(torch.abs(logits_per_image_eager - logits_per_image_sdpa))}", + ) + self.assertTrue( + torch.allclose(logits_per_text_eager, 
logits_per_text_sdpa, atol=4e-2, rtol=4e-2), + f"Text logits max diff: {torch.max(torch.abs(logits_per_text_eager - logits_per_text_sdpa))}", + ) + + +class AIMv2ForImageClassificationModelTester(AIMv2ModelTester): + def __init__(self, parent): + super().__init__(parent) + self.batch_size = self.vision_model_tester.batch_size + self.num_hidden_layers = self.vision_model_tester.num_hidden_layers + self.hidden_size = self.vision_model_tester.hidden_size + self.seq_length = self.vision_model_tester.seq_length + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class AIMv2ForImageClassificationModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (AIMv2ForImageClassification,) if is_torch_available() else () + pipeline_model_mapping = {"image-classification": AIMv2ForImageClassification} if is_torch_available() else {} + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + _is_composite = True + + def setUp(self): + self.model_tester = AIMv2ForImageClassificationModelTester(self) + + @unittest.skip(reason="AIMv2ForImageClassification does not support inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="AIMv2ForImageClassification does not support inputs_embeds") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="AIMv2ForImageClassification does not support gradient checkpointing yet") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="AIMv2ForImageClassification does not support gradient checkpointing yet") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="AIMv2ForImageClassification does not support gradient checkpointing yet") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="AIMv2 uses the same initialization scheme as the Flax original implementation") + def test_initialization(self): + pass + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("logits",), + use_attention_mask_options=(None,), + ) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@require_vision +@require_torch +class AIMv2ModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "apple/aimv2-large-patch14-224" + model = AIMv2Model.from_pretrained(model_name).to(torch_device) + processor = CLIPProcessor.from_pretrained(model_name) + + image = prepare_img() + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ).to(torch_device) + + # forward pass + 
with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + + torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3) + + @slow + def test_inference_interpolate_pos_encoding(self): + # AIMv2 models have an `interpolate_pos_encoding` argument in their forward method, + # allowing to interpolate the pre-trained position embeddings in order to use + # the model on higher resolutions. The DINO model by Facebook AI leverages this + # to visualize self-attention on higher resolution images. + model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224").to(torch_device) + + processor = CLIPProcessor.from_pretrained( + "apple/aimv2-large-patch14-224", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + ) + + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) + + # interpolate_pos_encodiung false should return value error + with self.assertRaises(ValueError, msg="doesn't match model"): + with torch.no_grad(): + model(**inputs, interpolate_pos_encoding=False) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, interpolate_pos_encoding=True) + + # verify the logits + expected_shape = torch.Size((1, 26, 768)) + + self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]] + ).to(torch_device) + + torch.testing.assert_close( + outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4 + ) From c63b292a0ca6590c6d0ec268535242fdb1f31327 Mon Sep 17 00:00:00 2001 From: yaswanth Date: Sat, 8 Mar 2025 19:20:17 +0530 Subject: [PATCH 02/62] changes --- src/transformers/__init__.py | 30 +- src/transformers/models/__init__.py | 2 +- src/transformers/models/aimv2/__init__.py | 2 - .../models/aimv2/configuration_aimv2.py | 412 ++---------------- .../models/aimv2/modeling_aimv2.py | 332 ++++++++++++++ .../models/aimv2/modular_aimv2.py | 144 ++++++ .../models/auto/configuration_auto.py | 4 +- .../models/auto/feature_extraction_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +- .../models/auto/processing_auto.py | 2 +- .../models/auto/tokenization_auto.py | 14 +- tests/models/aimv2/test_modeling_aimv2.py | 8 +- 12 files changed, 549 insertions(+), 409 deletions(-) create mode 100644 src/transformers/models/aimv2/modular_aimv2.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 321580896d7..4a3a5c3e1db 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5338,6 +5338,11 @@ if TYPE_CHECKING: load_tf2_model_in_pytorch_model, load_tf2_weights_in_pytorch_model, ) + from .models.aimv2 import ( + AIMv2Config, + AIMv2TextConfig, + AIMv2VisionConfig, + ) from .models.albert import AlbertConfig from .models.align import ( AlignConfig, @@ -5475,13 +5480,6 @@ if TYPE_CHECKING: CLIPTokenizer, CLIPVisionConfig, ) - from .models.aimv2 import ( - AIMv2Config, - - AIMv2TextConfig, - - AIMv2VisionConfig, - ) from 
.models.clipseg import ( CLIPSegConfig, CLIPSegProcessor, @@ -6724,6 +6722,15 @@ if TYPE_CHECKING: ) from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from .modeling_utils import PreTrainedModel + from .models.aimv2 import ( + AIMv2ForImageClassification, + AIMv2Model, + AIMv2PreTrainedModel, + AIMv2TextModel, + AIMv2TextModelWithProjection, + AIMv2VisionModel, + AIMv2VisionModelWithProjection, + ) from .models.albert import ( AlbertForMaskedLM, AlbertForMultipleChoice, @@ -7033,15 +7040,6 @@ if TYPE_CHECKING: CLIPVisionModel, CLIPVisionModelWithProjection, ) - from .models.aimv2 import ( - AIMv2ForImageClassification, - AIMv2Model, - AIMv2PreTrainedModel, - AIMv2TextModel, - AIMv2TextModelWithProjection, - AIMv2VisionModel, - AIMv2VisionModelWithProjection, - ) from .models.clipseg import ( CLIPSegForImageSegmentation, CLIPSegModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0158eb42903..e853eee15b0 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -13,6 +13,7 @@ # limitations under the License. from . import ( + aimv2, albert, align, altclip, @@ -49,7 +50,6 @@ from . import ( chinese_clip, clap, clip, - aimv2, clipseg, clvp, code_llama, diff --git a/src/transformers/models/aimv2/__init__.py b/src/transformers/models/aimv2/__init__.py index de0f553d50c..c0133639960 100644 --- a/src/transformers/models/aimv2/__init__.py +++ b/src/transformers/models/aimv2/__init__.py @@ -20,8 +20,6 @@ from ...utils.import_utils import define_import_structure if TYPE_CHECKING: from .configuration_aimv2 import * from .modeling_aimv2 import * - from .modeling_flax_aimv2 import * - from .modeling_tf_aimv2 import * else: import sys diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index cbf2ab48842..119df3c0892 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -29,394 +29,58 @@ from ...utils import logging logger = logging.get_logger(__name__) - -class AIMv2TextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a AIMv2 - text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 - [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - +class AIMv2Config(PretrainedConfig): + """This is the configuration class to store the configuration of an [`AIMv2Model`]. + Instantiating a configuration with the defaults will yield a similar configuration + to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). Args: - vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`AIMv2Model`]. - hidden_size (`int`, *optional*, defaults to 512): - Dimensionality of the encoder layers and the pooler layer. 
- intermediate_size (`int`, *optional*, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - projection_dim (`int`, *optional*, defaults to 512): - Dimensionality of text and vision projection layers. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 8): - Number of attention heads for each attention layer in the Transformer encoder. - max_position_embeddings (`int`, *optional*, defaults to 77): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1.0): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). - pad_token_id (`int`, *optional*, defaults to 1): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 49406): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 49407): - End of stream token id. + hidden_size: Dimension of the hidden representations. + intermediate_size: Dimension of the SwiGLU representations. + num_hidden_layers: Number of hidden layers in the Transformer. + num_attention_heads: Number of attention heads for each attention layer + in the Transformer. + num_channels: Number of input channels. + image_size: Image size. + patch_size: Patch size. + rms_norm_eps: Epsilon value used for the RMS normalization layer. + attention_dropout: Dropout ratio for attention probabilities. + projection_dropout: Dropout ratio for the projection layer after the attention. + qkv_bias: Whether to add a bias to the queries, keys and values. + use_bias: Whether to add a bias in the feed-forward and projection layers. + kwargs: Keyword arguments for the [`PretrainedConfig`]. 
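+
+    Example (usage sketch, mirroring the example blocks of the other configuration classes in this library;
+    `AIMv2Model` here refers to the vision encoder defined in `modeling_aimv2.py`):
+
+    ```python
+    >>> from transformers import AIMv2Config, AIMv2Model
+
+    >>> # Initializing an AIMv2Config with apple/aimv2-large-patch14-224 style defaults
+    >>> configuration = AIMv2Config()
+
+    >>> # Initializing an AIMv2Model (with random weights) from that configuration
+    >>> model = AIMv2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```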
+ """ - Example: - - ```python - >>> from transformers import AIMv2TextConfig, AIMv2TextModel - - >>> # Initializing a AIMv2TextConfig with apple/aimv2-large-patch14-224 style configuration - >>> configuration = AIMv2TextConfig() - - >>> # Initializing a AIMv2TextModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration - >>> model = AIMv2TextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "aimv2_text_model" - base_config_key = "text_config" + model_type: str = "aimv2" def __init__( self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - # This differs from `CLIPTokenizer`'s default and from openai/aimv2 - # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 - pad_token_id=1, - bos_token_id=49406, - eos_token_id=49407, - **kwargs, - ): - super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - - -class AIMv2VisionConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a - AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 - [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - projection_dim (`int`, *optional*, defaults to 512): - Dimensionality of text and vision projection layers. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 32): - The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the layer normalization layers. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1.0): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). - - Example: - - ```python - >>> from transformers import AIMv2VisionConfig, AIMv2VisionModel - - >>> # Initializing a AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration - >>> configuration = AIMv2VisionConfig() - - >>> # Initializing a AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration - >>> model = AIMv2VisionModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "aimv2_vision_model" - base_config_key = "vision_config" - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=1e-5, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + **kwargs: Any, ): super().__init__(**kwargs) - self.hidden_size = hidden_size self.intermediate_size = intermediate_size - self.projection_dim = projection_dim self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.num_channels = num_channels self.patch_size = patch_size self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + self.rms_norm_eps = rms_norm_eps + self.projection_dropout = projection_dropout + self.qkv_bias = qkv_bias + self.use_bias = use_bias -class AIMv2Config(PretrainedConfig): - r""" - [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to instantiate - a AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating - a configuration with the defaults will yield a similar configuration to that of the AIMv2 - [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`AIMv2TextConfig`]. - vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`AIMv2VisionConfig`]. 
- projection_dim (`int`, *optional*, defaults to 512): - Dimensionality of text and vision projection layers. - logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The initial value of the *logit_scale* parameter. Default is used as per the original AIMv2 implementation. - kwargs (*optional*): - Dictionary of keyword arguments. - - Example: - - ```python - >>> from transformers import AIMv2Config, AIMv2Model - - >>> # Initializing a AIMv2Config with apple/aimv2-large-patch14-224 style configuration - >>> configuration = AIMv2Config() - - >>> # Initializing a AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224 style configuration - >>> model = AIMv2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - - >>> # We can also initialize a AIMv2Config from a AIMv2TextConfig and a AIMv2VisionConfig - >>> from transformers import AIMv2TextConfig, AIMv2VisionConfig - - >>> # Initializing a AIMv2Text and AIMv2Vision configuration - >>> config_text = AIMv2TextConfig() - >>> config_vision = AIMv2VisionConfig() - - >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) - ```""" - - model_type = "aimv2" - sub_configs = {"text_config": AIMv2TextConfig, "vision_config": AIMv2VisionConfig} - - def __init__( - self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs - ): - # If `_config_dict` exist, we use them for the backward compatibility. - # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot - # of confusion!). - text_config_dict = kwargs.pop("text_config_dict", None) - vision_config_dict = kwargs.pop("vision_config_dict", None) - - super().__init__(**kwargs) - - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in - # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most - # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. - if text_config_dict is not None: - if text_config is None: - text_config = {} - - # This is the complete result when using `text_config_dict`. - _text_config_dict = AIMv2TextConfig(**text_config_dict).to_dict() - - # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. - for key, value in _text_config_dict.items(): - if key in text_config and value != text_config[key] and key not in ["transformers_version"]: - # If specified in `text_config_dict` - if key in text_config_dict: - message = ( - f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " - f'The value `text_config_dict["{key}"]` will be used instead.' - ) - # If inferred from default argument values (just to be super careful) - else: - message = ( - f"`text_config_dict` is provided which will be used to initialize `AIMv2TextConfig`. The " - f'value `text_config["{key}"]` will be overridden.' - ) - logger.info(message) - - # Update all values in `text_config` with the ones in `_text_config_dict`. - text_config.update(_text_config_dict) - - if vision_config_dict is not None: - if vision_config is None: - vision_config = {} - - # This is the complete result when using `vision_config_dict`. 
- _vision_config_dict = AIMv2VisionConfig(**vision_config_dict).to_dict() - # convert keys to string instead of integer - if "id2label" in _vision_config_dict: - _vision_config_dict["id2label"] = { - str(key): value for key, value in _vision_config_dict["id2label"].items() - } - - # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. - for key, value in _vision_config_dict.items(): - if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: - # If specified in `vision_config_dict` - if key in vision_config_dict: - message = ( - f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " - f'values. The value `vision_config_dict["{key}"]` will be used instead.' - ) - # If inferred from default argument values (just to be super careful) - else: - message = ( - f"`vision_config_dict` is provided which will be used to initialize `AIMv2VisionConfig`. " - f'The value `vision_config["{key}"]` will be overridden.' - ) - logger.info(message) - - # Update all values in `vision_config` with the ones in `_vision_config_dict`. - vision_config.update(_vision_config_dict) - - if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `AIMv2TextConfig` with default values.") - - if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. initializing the `AIMv2VisionConfig` with default values.") - - self.text_config = AIMv2TextConfig(**text_config) - self.vision_config = AIMv2VisionConfig(**vision_config) - - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.initializer_factor = 1.0 - - @classmethod - def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs): - r""" - Instantiate a [`AIMv2Config`] (or a derived class) from aimv2 text model configuration and aimv2 vision model - configuration. 
- - Returns: - [`AIMv2Config`]: An instance of a configuration object - """ - - return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) - - -class AIMv2OnnxConfig(OnnxConfig): - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("input_ids", {0: "batch", 1: "sequence"}), - ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ] - ) - - @property - def outputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict( - [ - ("logits_per_image", {0: "batch"}), - ("logits_per_text", {0: "batch"}), - ("text_embeds", {0: "batch"}), - ("image_embeds", {0: "batch"}), - ] - ) - - @property - def atol_for_validation(self) -> float: - return 1e-4 - - def generate_dummy_inputs( - self, - processor: "ProcessorMixin", - batch_size: int = -1, - seq_length: int = -1, - framework: Optional["TensorType"] = None, - ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( - processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework - ) - image_input_dict = super().generate_dummy_inputs( - processor.image_processor, batch_size=batch_size, framework=framework - ) - return {**text_input_dict, **image_input_dict} - - @property - def default_onnx_opset(self) -> int: - return 14 - - -__all__ = ["AIMv2Config", "AIMv2OnnxConfig", "AIMv2TextConfig", "AIMv2VisionConfig"] +__all__ = ["AIMv2Config"] diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index e69de29bb2d..7b9010d7828 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -0,0 +1,332 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aimv2/modular_aimv2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aimv2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Apple Inc. and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
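+
+# High-level layout of this file: a Conv2d patch embedding (`AIMv2Embeddings`), a stack of pre-RMSNorm
+# encoder layers with SwiGLU feed-forward blocks (`AIMv2EncoderLayer` / `AIMv2Encoder`), and a final
+# RMSNorm applied to the encoder output in `AIMv2Model`.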
+
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from transformers.modeling_outputs import BaseModelOutput
+from transformers.modeling_utils import PreTrainedModel
+
+from ...activations import ACT2FN
+from .configuration_aimv2 import AIMv2Config
+
+
+class AIMv2PreTrainedModel(PreTrainedModel):
+    pass
+
+
+class AIMv2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        AIMv2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class AIMv2SwiGLUFFN(nn.Module):
+    def __init__(self, config: AIMv2Config):
+        super().__init__()
+        in_features = config.hidden_size
+        out_features = config.intermediate_size
+        # The skeleton config does not define `hidden_act` yet; SwiGLU conventionally uses SiLU.
+        self.act_fn = getattr(config, "hidden_act", "silu")
+
+        self.fc1 = nn.Linear(in_features, out_features, bias=config.use_bias)
+        self.fc2 = nn.Linear(out_features, in_features, bias=config.use_bias)
+        self.fc3 = nn.Linear(in_features, out_features, bias=config.use_bias)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        fc3_out = self.fc3(hidden_states)
+        fc1_out = self.fc1(hidden_states)
+        hidden_states = ACT2FN[self.act_fn](fc1_out) * fc3_out
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class AIMv2Embeddings(nn.Module):
+    def __init__(self, config: AIMv2Config):
+        super().__init__()
+        self.patch_embed = nn.Conv2d(
+            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
+        )
+        self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps)
+
+        num_patches = (config.image_size // config.patch_size) ** 2
+        self.position_embeddings = nn.Embedding(num_patches, config.hidden_size)
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(height, width, embed_dim):
+        pass
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
+        hidden_states = self.rms_norm(hidden_states)
+
+        _, num_patches, _ = hidden_states.size()
+
+        # TODO: optionally build 2D sin-cos position embeddings natively here; for now the learned
+        # embedding table is broadcast over the batch dimension.
+        hidden_states = hidden_states + self.position_embeddings.weight
+
+        return hidden_states
+
+
+class AIMv2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        k_v_seq_len = key_states.shape[-2]
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+
+        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class AIMv2EncoderLayer(nn.Module):
+    def __init__(self, config: AIMv2Config):
+        super().__init__()
+        self.attention = AIMv2Attention(config)
+        self.ffn = AIMv2SwiGLUFFN(config)
+        self.rms_norm1 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, attention_mask, output_attentions: Optional[bool] = False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        norm_hidden_states = self.rms_norm1(hidden_states)
+        attn_output, attn_weights = self.attention(
+            hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+        )
+
+        hidden_states = hidden_states + attn_output
+        norm_hidden_states = self.rms_norm2(hidden_states)
+        mlp_output = self.ffn(norm_hidden_states)
+
+        hidden_states = hidden_states + mlp_output
+        return (hidden_states, attn_weights) if output_attentions else (hidden_states,)
+
+
+class AIMv2Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`AIMv2EncoderLayer`].
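+    Each layer applies AIMv2RMSNorm before its self-attention block and again before its SwiGLU feed-forward
+    block, with residual connections around both.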
+ + Args: + config: AIMv2Config + """ + + def __init__(self, config: AIMv2Config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([AIMv2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class AIMv2Model(nn.Module): + def __init__(self, config: AIMv2Config): + super().__init__() + self.config = config + self.embeddings = AIMv2Embeddings(config) + self.encoder = AIMv2Encoder(config) + self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + def forward( + self, + pixel_values, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = 
output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.rms_norm(last_hidden_state) + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +__all__ = ["AIMv2Model"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py new file mode 100644 index 00000000000..8cccbad64fb --- /dev/null +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2025 Apple Inc. and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +import torch +from .configuration_aimv2 import AIMv2Config +from torch import nn +from torch.nn import functional as F +from transformers.modeling_outputs import BaseModelOutput +from transformers.modeling_utils import PreTrainedModel +from ...activations import ACT2FN +from ..llama.modeling_llama import LlamaRMSNorm +from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder + + +class AIMv2PreTrainedModel(PreTrainedModel): + pass + +class AIMv2RMSNorm(LlamaRMSNorm): + pass + +class AIMv2SwiGLUFFN(nn.Module): + def __init__(self, config: AIMv2Config): + super().__init__() + in_features = config.hidden_size + out_features = config.intermediate_size + self.act_fn = config.hidden_act + + self.fc1 = nn.Linear(in_features, out_features, bias=config.use_bias) + self.fc2 = nn.Linear(out_features, in_features, bias=config.use_bias) + self.fc3 = nn.Linear(in_features, out_features, bias=config.use_bias) + + def forward(self, hidden_states:torch.Tensor) -> torch.Tensor: + fc3_out = self.fc3(hidden_states) + fc1_out = self.fc1(hidden_states) + hidden_states = ACT2FN[self.act_fn](fc1_out) * fc3_out + hidden_states = self.fc2(hidden_states) + return hidden_states + +class AIMv2Embeddings(nn.Module): + def __init__(self, config:AIMv2Config ): + self.patch_embed = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size= config.patch_size, stride=config.patch_size) + self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + num_patches = (config.image_size // config.patch_size) ** 2 + self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + + @staticmethod + def build_2d_sincos_position_embedding(height, width, embed_dim): + pass + + def forward(self, pixel_values:torch.Tensor) -> torch.Tensor: + 
hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) + hidden_states = self.norm(hidden_states) + + _, num_patches, _ = hidden_states.size() + + # added logic for native in build s2d sincos pos embed + hidden_states = hidden_states +self.position_embeddings + + return hidden_states + +class AIMv2Attention(SiglipAttention): + pass + +class AIMv2EncoderLayer(nn.Module): + def __init__(self, config: AIMv2Config): + super().__init__() + self.attention = AIMv2Attention(config) + self.ffn = AIMv2SwiGLUFFN(config) + self.rms_norm1 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + def forward(self, hidden_states: torch.Tensor, attention_mask, output_attention:Optional[bool]=False) -> Tuple[torch.Tensor, torch.Tensor]: + norm_hidden_states = self.rms_norm1(hidden_states) + attn_output, attn_wights = self.attention(hidden_states=norm_hidden_states, + attention_mask=attention_mask, + output_attention=output_attention) + + hidden_states = hidden_states + attn_output + norm_hidden_states = self.rms_norm2(hidden_states) + mlp_output = self.ffn(norm_hidden_states) + + hidden_states = hidden_states + mlp_output + return (hidden_states, attn_wights) if output_attention else (hidden_states,) + +class AIMv2Encoder(SiglipEncoder): + pass + +class AIMv2Model(nn.Module): + def __init__(self,config: AIMv2Config): + super().__init__() + self.config = config + self.embeddings = AIMv2Embeddings(config) + self.encoder = AIMv2Encoder(config) + self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + def forward( + self, + pixel_values, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.rms_norm(last_hidden_state) + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + +__all__ = ["AIMv2Model"] \ No newline at end of file diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 36c84eac8d1..f7f648cf97e 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -32,6 +32,7 @@ logger = logging.get_logger(__name__) CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here + ("aimv2", "AIMv2Config"), ("albert", "AlbertConfig"), ("align", "AlignConfig"), ("altclip", "AltCLIPConfig"), @@ -64,7 +65,6 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("chinese_clip_vision_model", "ChineseCLIPVisionConfig"), ("clap", "ClapConfig"), ("clip", "CLIPConfig"), - ("aimv2", "AIMv2Config"), ("clip_text_model", "CLIPTextConfig"), ("clip_vision_model", "CLIPVisionConfig"), ("clipseg", "CLIPSegConfig"), @@ -359,6 +359,7 @@ 
CONFIG_MAPPING_NAMES = OrderedDict( MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here + ("aimv2", "AIMv2"), ("albert", "ALBERT"), ("align", "ALIGN"), ("altclip", "AltCLIP"), @@ -397,7 +398,6 @@ MODEL_NAMES_MAPPING = OrderedDict( ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), ("clap", "CLAP"), ("clip", "CLIP"), - ("aimv2", "AIMv2"), ("clip_text_model", "CLIPTextModel"), ("clip_vision_model", "CLIPVisionModel"), ("clipseg", "CLIPSeg"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 9474de57501..ce6e276ba69 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,12 +39,12 @@ logger = logging.get_logger(__name__) FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( [ + ("aimv2", "AIMv2FeatureExtractor"), ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("beit", "BeitFeatureExtractor"), ("chinese_clip", "ChineseCLIPFeatureExtractor"), ("clap", "ClapFeatureExtractor"), ("clip", "CLIPFeatureExtractor"), - ("aimv2", "AIMv2FeatureExtractor"), ("clipseg", "ViTFeatureExtractor"), ("clvp", "ClvpFeatureExtractor"), ("conditional_detr", "ConditionalDetrFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 77cdc245ee4..24fd4078846 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -33,6 +33,7 @@ logger = logging.get_logger(__name__) MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping + ("aimv2", "AIMv2Model"), ("albert", "AlbertModel"), ("align", "AlignModel"), ("altclip", "AltCLIPModel"), @@ -64,7 +65,6 @@ MODEL_MAPPING_NAMES = OrderedDict( ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), ("clap", "ClapModel"), ("clip", "CLIPModel"), - ("aimv2", "AIMv2Model"), ("clip_text_model", "CLIPTextModel"), ("clip_vision_model", "CLIPVisionModel"), ("clipseg", "CLIPSegModel"), @@ -679,10 +679,10 @@ MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Image Classification mapping + ("aimv2", "AIMv2ForImageClassification"), ("beit", "BeitForImageClassification"), ("bit", "BitForImageClassification"), ("clip", "CLIPForImageClassification"), - ("aimv2", "AIMv2ForImageClassification"), ("convnext", "ConvNextForImageClassification"), ("convnextv2", "ConvNextV2ForImageClassification"), ("cvt", "CvtForImageClassification"), @@ -1412,13 +1412,13 @@ MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = OrderedDict( MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Zero Shot Image Classification mapping + ("aimv2", "AIMv2Model"), ("align", "AlignModel"), ("altclip", "AltCLIPModel"), ("blip", "BlipModel"), ("blip-2", "Blip2ForImageTextRetrieval"), ("chinese_clip", "ChineseCLIPModel"), ("clip", "CLIPModel"), - ("aimv2", "AIMv2Model"), ("clipseg", "CLIPSegModel"), ("siglip", "SiglipModel"), ("siglip2", "Siglip2Model"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 816993a7a5a..f456186054a 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -45,6 +45,7 @@ logger = logging.get_logger(__name__) PROCESSOR_MAPPING_NAMES = OrderedDict( [ + ("aimv2", "AIMv2Processor"), ("align", "AlignProcessor"), ("altclip", "AltCLIPProcessor"), ("aria", 
"AriaProcessor"), @@ -57,7 +58,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("chinese_clip", "ChineseCLIPProcessor"), ("clap", "ClapProcessor"), ("clip", "CLIPProcessor"), - ("aimv2", "AIMv2Processor"), ("clipseg", "CLIPSegProcessor"), ("clvp", "ClvpProcessor"), ("colpali", "ColPaliProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index f7112b2f18b..3d320b95205 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -60,6 +60,13 @@ if TYPE_CHECKING: else: TOKENIZER_MAPPING_NAMES = OrderedDict( [ + ( + "aimv2", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), ( "albert", ( @@ -131,13 +138,6 @@ else: "CLIPTokenizerFast" if is_tokenizers_available() else None, ), ), - ( - "aimv2", - ( - "CLIPTokenizer", - "CLIPTokenizerFast" if is_tokenizers_available() else None, - ), - ), ( "clipseg", ( diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 296c1811729..20b15d43efc 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -388,7 +388,9 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AIMv2VisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=AIMv2VisionConfig, has_text_modality=False, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=AIMv2VisionConfig, has_text_modality=False, hidden_size=37 + ) def test_config(self): self.config_tester.run_common_tests() @@ -1111,7 +1113,9 @@ class AIMv2ModelIntegrationTest(unittest.TestCase): model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224").to(torch_device) processor = CLIPProcessor.from_pretrained( - "apple/aimv2-large-patch14-224", size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} + "apple/aimv2-large-patch14-224", + size={"height": 180, "width": 180}, + crop_size={"height": 180, "width": 180}, ) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") From 58edea82946cf2f1e368890c40de2fe227deb325 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 8 Mar 2025 22:38:23 +0530 Subject: [PATCH 03/62] temp push --- .../models/aimv2/configuration_aimv2.py | 18 +- .../convert_aimv2_original_pytorch_to_hf.py | 158 ++++-------------- .../models/aimv2/modeling_aimv2.py | 21 ++- .../models/aimv2/modular_aimv2.py | 155 +++++++++++++++-- src/transformers/utils/dummy_pt_objects.py | 13 ++ 5 files changed, 201 insertions(+), 164 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 119df3c0892..50f1b62f973 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -14,21 +14,16 @@ # limitations under the License. """AIMv2 model configuration""" -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional +from typing import Any -if TYPE_CHECKING: - from ...processing_utils import ProcessorMixin - from ...utils import TensorType - from ...configuration_utils import PretrainedConfig -from ...onnx import OnnxConfig from ...utils import logging logger = logging.get_logger(__name__) + class AIMv2Config(PretrainedConfig): """This is the configuration class to store the configuration of an [`AIMv2Model`]. 
Instantiating a configuration with the defaults will yield a similar configuration @@ -64,8 +59,10 @@ class AIMv2Config(PretrainedConfig): rms_norm_eps: float = 1e-5, attention_dropout: float = 0.0, projection_dropout: float = 0.0, - qkv_bias: bool = False, + attention_bias: bool = False, use_bias: bool = False, + hidden_act = 'silu', + initializer_range=0.02, **kwargs: Any, ): super().__init__(**kwargs) @@ -80,7 +77,10 @@ class AIMv2Config(PretrainedConfig): self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.qkv_bias = qkv_bias + self.attention_bias = attention_bias self.use_bias = use_bias + self.hidden_act = hidden_act + self.initializer_range = initializer_range + __all__ = ["AIMv2Config"] diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index db2a7285871..865b30981b4 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -16,141 +16,43 @@ import argparse import torch -from aimv2 import load from transformers import AIMv2Config, AIMv2Model -def copy_attn_layer(hf_attn_layer, pt_attn_layer): - q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) - - out_proj_weights = pt_attn_layer.out_proj.weight - out_proj_bias = pt_attn_layer.out_proj.bias - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight = out_proj_weights - hf_attn_layer.out_proj.bias = out_proj_bias - - -def copy_mlp(hf_mlp, pt_mlp): - copy_linear(hf_mlp.fc1, pt_mlp.c_fc) - copy_linear(hf_mlp.fc2, pt_mlp.c_proj) - - -def copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - -def copy_layer(hf_layer, pt_layer): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) - copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) - - # copy MLP - copy_mlp(hf_layer.mlp, pt_layer.mlp) - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_layer.attn) - - -def copy_layers(hf_layers, pt_layers): - for hf_layer, pt_layer in zip(hf_layers, pt_layers): - copy_layer(hf_layer, pt_layer) - - -def copy_encoder(hf_encoder, pt_model): - # copy embeds - hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight - hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding - - # copy layer norm - copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) - - # copy hidden layers - copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) - - -def copy_text_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous() - - # copy text encoder - copy_encoder(hf_model.text_model, pt_model) - - -def copy_vison_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous() - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) - copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) - - # copy embeds - 
hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data - hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) - - -@torch.no_grad() -def convert_aimv2_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = AIMv2Config.from_pretrained(config_path) - else: - config = AIMv2Config(projection_dim=512, text_config={}, vision_config={}) - - hf_model = AIMv2Model(config).eval() - - pt_model, _ = load(checkpoint_path, device="cpu", jit=False) - pt_model = pt_model.eval() - - copy_text_model_and_projection(hf_model, pt_model) - copy_vison_model_and_projection(hf_model, pt_model) - hf_model.logit_scale = pt_model.logit_scale - - # Use `eos_token` so the example is more meaningful - input_ids = torch.tensor( - [ - [config.text_config.bos_token_id] - + list(range(3, 77)) - + [config.text_config.eos_token_id] - + [config.text_config.pad_token_id] - ] +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--hf_repo_id", + default="apple/DepthPro", + help="Location of official weights from apple on HF", ) - pixel_values = torch.randn(1, 3, 224, 224) + parser.add_argument( + "--output_dir", + default="apple_DepthPro", + help="Location to write the converted model and processor", + ) + parser.add_argument( + "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." + ) + parser.add_argument( + "--push_to_hub", + action=argparse.BooleanOptionalAction, + help="Whether or not to push the converted model to the huggingface hub.", + ) + parser.add_argument( + "--hub_repo_id", + default="apple/DepthPro-hf", + help="Huggingface hub repo to write the converted model and processor", + ) + args = parser.parse_args() - hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True) - hf_logits_per_image = hf_outputs.logits_per_image - hf_logits_per_text = hf_outputs.logits_per_text - pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) - assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) - assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) + if args.push_to_hub: + print("Pushing to hub...") + # model.push_to_hub(args.hub_repo_id) + # image_processor.push_to_hub(args.hub_repo_id) if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_aimv2_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) + main() \ No newline at end of file diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 7b9010d7828..1127972beba 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -103,7 
+103,7 @@ class AIMv2Embeddings(nn.Module): class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config): + def __init__(self, config: AIMv2Config): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -115,12 +115,12 @@ class AIMv2Attention(nn.Module): f" {self.num_heads})." ) self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.proj_drop = nn.Dropout(config.projection_dropout) def forward( self, @@ -170,9 +170,12 @@ class AIMv2Attention(nn.Module): attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - attn_output = self.out_proj(attn_output) + attn_output = self.proj_out(attn_output) + attn_output = self.proj_drop(attn_output) - return attn_output, attn_weights + output = (attn_output, attn_weights) if output_attentions else (attn_output,) + + return output class AIMv2EncoderLayer(nn.Module): @@ -329,4 +332,4 @@ class AIMv2Model(nn.Module): ) -__all__ = ["AIMv2Model"] +__all__ = ["AIMv2Model", "AIMv2PreTrainedModel"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 8cccbad64fb..f3484ee3b34 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -13,24 +13,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
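The modular file below aliases AIMv2RMSNorm to LlamaRMSNorm rather than re-implementing it. For readers following the refactor, this is the computation being inherited, written out as a standalone sketch (the class name, hidden_size and eps here are illustrative, not part of the patch):

import torch
from torch import nn

class RMSNormSketch(nn.Module):
    """Same math as LlamaRMSNorm: normalize by the root-mean-square, then apply a learned scale."""

    def __init__(self, hidden_size: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        # Compute in fp32 for numerical stability, then cast back to the input dtype.
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)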
-from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, List import torch -from .configuration_aimv2 import AIMv2Config from torch import nn -from torch.nn import functional as F + from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import PreTrainedModel + from ...activations import ACT2FN from ..llama.modeling_llama import LlamaRMSNorm from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder +from .configuration_aimv2 import AIMv2Config class AIMv2PreTrainedModel(PreTrainedModel): pass + class AIMv2RMSNorm(LlamaRMSNorm): - pass + pass + class AIMv2SwiGLUFFN(nn.Module): def __init__(self, config: AIMv2Config): @@ -43,16 +46,19 @@ class AIMv2SwiGLUFFN(nn.Module): self.fc2 = nn.Linear(out_features, in_features, bias=config.use_bias) self.fc3 = nn.Linear(in_features, out_features, bias=config.use_bias) - def forward(self, hidden_states:torch.Tensor) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: fc3_out = self.fc3(hidden_states) fc1_out = self.fc1(hidden_states) hidden_states = ACT2FN[self.act_fn](fc1_out) * fc3_out hidden_states = self.fc2(hidden_states) return hidden_states - + + class AIMv2Embeddings(nn.Module): - def __init__(self, config:AIMv2Config ): - self.patch_embed = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size= config.patch_size, stride=config.patch_size) + def __init__(self, config: AIMv2Config): + self.patch_embed = nn.Conv2d( + config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size + ) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) num_patches = (config.image_size // config.patch_size) ** 2 @@ -62,19 +68,95 @@ class AIMv2Embeddings(nn.Module): def build_2d_sincos_position_embedding(height, width, embed_dim): pass - def forward(self, pixel_values:torch.Tensor) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) hidden_states = self.norm(hidden_states) _, num_patches, _ = hidden_states.size() # added logic for native in build s2d sincos pos embed - hidden_states = hidden_states +self.position_embeddings + hidden_states = hidden_states + self.position_embeddings return hidden_states -class AIMv2Attention(SiglipAttention): - pass + +class AIMv2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config:AIMv2Config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.proj_drop = nn.Dropout(config.projection_dropout) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): + raise ValueError( + f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.proj_out(attn_output) + attn_output = self.proj_drop(attn_output) + + output = (attn_output, attn_weights) if output_attentions else (attn_output,) + + return output + class AIMv2EncoderLayer(nn.Module): def __init__(self, config: AIMv2Config): @@ -84,11 +166,13 @@ class AIMv2EncoderLayer(nn.Module): self.rms_norm1 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) - def forward(self, hidden_states: torch.Tensor, attention_mask, output_attention:Optional[bool]=False) -> Tuple[torch.Tensor, torch.Tensor]: + def forward( + self, hidden_states: torch.Tensor, attention_mask, output_attention: Optional[bool] = False + ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) - attn_output, attn_wights = self.attention(hidden_states=norm_hidden_states, - attention_mask=attention_mask, - output_attention=output_attention) + attn_output, attn_wights = self.attention( + hidden_states=norm_hidden_states, attention_mask=attention_mask, 
output_attention=output_attention + ) hidden_states = hidden_states + attn_output norm_hidden_states = self.rms_norm2(hidden_states) @@ -97,17 +181,51 @@ class AIMv2EncoderLayer(nn.Module): hidden_states = hidden_states + mlp_output return (hidden_states, attn_wights) if output_attention else (hidden_states,) + class AIMv2Encoder(SiglipEncoder): pass + +class AIMv2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AIMv2Config + base_model_prefix = "aimv2" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["AIMv2SwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None: + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, AIMv2Embeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + class AIMv2Model(nn.Module): - def __init__(self,config: AIMv2Config): + def __init__(self, config: AIMv2Config): super().__init__() self.config = config self.embeddings = AIMv2Embeddings(config) self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + # Initialize weights and apply final processing + self.post_init() + def forward( self, pixel_values, @@ -141,4 +259,5 @@ class AIMv2Model(nn.Module): attentions=encoder_outputs.attentions, ) -__all__ = ["AIMv2Model"] \ No newline at end of file + +__all__ = ["AIMv2Model"] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 85eea3cb100..1238cdadc35 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -555,6 +555,19 @@ class PreTrainedModel(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AIMv2Model(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AIMv2PreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + class AlbertForMaskedLM(metaclass=DummyObject): _backends = ["torch"] From 429e5b6348ceb0f65287715d897a4e69dd03a5d1 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sun, 9 Mar 2025 21:23:53 +0530 Subject: [PATCH 04/62] changes --- .../models/aimv2/configuration_aimv2.py | 27 +- .../convert_aimv2_original_pytorch_to_hf.py | 150 ++- .../models/aimv2/modeling_aimv2.py | 133 +- .../models/aimv2/modular_aimv2.py | 105 +- .../models/auto/processing_auto.py | 1 - src/transformers/utils/dummy_pt_objects.py | 1 + tests/models/aimv2/test_modeling_aimv2.py | 1126 +++-------------- 7 files changed, 454 insertions(+), 1089 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 50f1b62f973..11c7c024823 100644 --- 
a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -14,9 +14,6 @@ # limitations under the License. """AIMv2 model configuration""" -from typing import Any - - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -25,26 +22,6 @@ logger = logging.get_logger(__name__) class AIMv2Config(PretrainedConfig): - """This is the configuration class to store the configuration of an [`AIMv2Model`]. - Instantiating a configuration with the defaults will yield a similar configuration - to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). - Args: - hidden_size: Dimension of the hidden representations. - intermediate_size: Dimension of the SwiGLU representations. - num_hidden_layers: Number of hidden layers in the Transformer. - num_attention_heads: Number of attention heads for each attention layer - in the Transformer. - num_channels: Number of input channels. - image_size: Image size. - patch_size: Patch size. - rms_norm_eps: Epsilon value used for the RMS normalization layer. - attention_dropout: Dropout ratio for attention probabilities. - projection_dropout: Dropout ratio for the projection layer after the attention. - qkv_bias: Whether to add a bias to the queries, keys and values. - use_bias: Whether to add a bias in the feed-forward and projection layers. - kwargs: Keyword arguments for the [`PretrainedConfig`]. - """ - model_type: str = "aimv2" def __init__( @@ -61,9 +38,9 @@ class AIMv2Config(PretrainedConfig): projection_dropout: float = 0.0, attention_bias: bool = False, use_bias: bool = False, - hidden_act = 'silu', + hidden_act="silu", initializer_range=0.02, - **kwargs: Any, + **kwargs, ): super().__init__(**kwargs) self.hidden_size = hidden_size diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 865b30981b4..63c6fad70d7 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -14,22 +14,146 @@ # limitations under the License. 
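As a quick sanity check of the trimmed-down configuration above (assuming a checkout of this branch, since AIMv2 is not yet in a release), the new defaults can be inspected directly; only values visible in the diff are asserted:

from transformers import AIMv2Config

config = AIMv2Config()
# Defaults introduced or kept in this revision of the config.
assert config.hidden_act == "silu"
assert config.attention_bias is False
assert config.use_bias is False
assert config.rms_norm_eps == 1e-5
assert config.initializer_range == 0.02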
import argparse +import gc +import os +import re +from typing import Dict, Optional import torch +from huggingface_hub import snapshot_download +from safetensors import safe_open -from transformers import AIMv2Config, AIMv2Model +from transformers import AIMv2Config, AIMv2Model, AutoProcessor + + +NEW_MODEL_KEY_MAPPING = { + # Embeddings + r"preprocessor.patchifier.proj": r"embeddings.patch_embed", + r"preprocessor.pos_embed": r"embeddings.position_embeddings.weight", + r"preprocessor.patchifier.norm.weight": r"embeddings.rms_norm.weight", + # Encoder Layers + r"trunk.blocks.(\d+).attn.qkv": r"encoder.layers.\1.attention.qkv", + r"trunk.blocks.(\d+).attn.proj": r"encoder.layers.\1.attention.proj_out", + r"trunk.blocks.(\d+).mlp.fc1": r"encoder.layers.\1.ffn.fc1", + r"trunk.blocks.(\d+).mlp.fc2": r"encoder.layers.\1.ffn.fc2", + r"trunk.blocks.(\d+).mlp.fc3": r"encoder.layers.\1.ffn.fc3", + # Normalization Layers + r"trunk.blocks.(\d+).norm_1": r"encoder.layers.\1.rms_norm1", + r"trunk.blocks.(\d+).norm_2": r"encoder.layers.\1.rms_norm2", + # Final Norm + r"trunk.post_trunk_norm": r"rms_norm", +} + + +def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> Dict[str, torch.Tensor]: + # Download only the model.safetensors file + directory_path = snapshot_download( + repo_id=model_id, + revision=revision, + allow_patterns=["model.safetensors"], + ) + + original_state_dict = {} + safetensor_path = f"{directory_path}/model.safetensors" + + with safe_open(safetensor_path, framework="pt", device="cpu") as f: + for key in f.keys(): + original_state_dict[key] = f.get_tensor(key) + + return original_state_dict + + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + """Converts state dict keys from the old format to the new format.""" + + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in NEW_MODEL_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + + +def split_qkv_tensor(key, tensor): + """Splits a qkv tensor into separate q, k, v tensors and updates the key accordingly.""" + + new_keys = ["q_proj", "k_proj", "v_proj"] + split_size = tensor.shape[0] // 3 + split_tensors = torch.split(tensor, split_size, dim=0) + + return {key.replace("qkv", new_key): split_tensors[i] for i, new_key in enumerate(new_keys)} + + +def write_model( + hf_repo_id: str, + output_dir: str, + safe_serialization: bool = True, +): + os.makedirs(output_dir, exist_ok=True) + + # create config + config = AIMv2Config.from_pretrained(hf_repo_id) + + # Load original model state dict + original_state_dict = load_original_state_dict("apple/aimv2-large-patch14-224") + + print("Converting model...") + state_dict = {} + result = convert_old_keys_to_new_keys(original_state_dict) + all_keys = list(original_state_dict.keys()) + + for key in all_keys: + value = original_state_dict[key] + new_key = result.pop(key) + + if "qkv" in new_key: + qkv_state_dict = split_qkv_tensor(new_key, value) + state_dict.update(qkv_state_dict) + else: + state_dict[new_key] = value + + state_dict["embeddings.position_embeddings.weight"] = state_dict["embeddings.position_embeddings.weight"].squeeze( + 0 + ) + + print("Loading the checkpoint in a DepthPro model.") + model = AIMv2Model(config) + model.load_state_dict(state_dict, 
strict=True, assign=True) + print("Checkpoint loaded successfully.") + + print("Saving the model.") + model.save_pretrained(output_dir, safe_serialization=safe_serialization) + del state_dict, model + + # Safety check: reload the converted model + gc.collect() + print("Reloading the model to check if it's saved correctly.") + model = AIMv2Model.from_pretrained(output_dir, device_map="auto") + print("Model reloaded successfully.") + return model + + +def write_image_processor(hf_repo_id: str, output_dir: str): + image_processor = AutoProcessor.from_pretrained(hf_repo_id, use_fast=True) + image_processor.save_pretrained(output_dir) + return image_processor def main(): parser = argparse.ArgumentParser() parser.add_argument( "--hf_repo_id", - default="apple/DepthPro", + default="apple/aimv2-large-patch14-224", help="Location of official weights from apple on HF", ) parser.add_argument( "--output_dir", - default="apple_DepthPro", + default="aimv2_model", help="Location to write the converted model and processor", ) parser.add_argument( @@ -42,17 +166,29 @@ def main(): ) parser.add_argument( "--hub_repo_id", - default="apple/DepthPro-hf", + default=None, help="Huggingface hub repo to write the converted model and processor", ) args = parser.parse_args() + model = write_model( + hf_repo_id=args.hf_repo_id, + output_dir=args.output_dir, + safe_serialization=args.safe_serialization, + ) + + image_processor = write_image_processor( + hf_repo_id=args.hf_repo_id, + output_dir=args.output_dir, + ) if args.push_to_hub: print("Pushing to hub...") - # model.push_to_hub(args.hub_repo_id) - # image_processor.push_to_hub(args.hub_repo_id) + model.push_to_hub(args.hub_repo_id) + image_processor.push_to_hub(args.hub_repo_id) if __name__ == "__main__": - main() \ No newline at end of file + main() + +# python src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py.py --hf_repo_id apple/aimv2-large-patch14-224 --output_dir tmp/aimv2 --safe_serialization diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 1127972beba..2fbf5ec2ac5 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -19,20 +19,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
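The renaming and qkv handling in the conversion script above can be hard to follow from the regexes alone. The snippet below is a toy reproduction of that behavior on a single fused projection; the mapping entry is copied from NEW_MODEL_KEY_MAPPING, while the tensor shape and printed keys are illustrative only:

import re
import torch

mapping = {r"trunk.blocks.(\d+).attn.qkv": r"encoder.layers.\1.attention.qkv"}

old_key = "trunk.blocks.0.attn.qkv.weight"
new_key = old_key
for pattern, replacement in mapping.items():
    new_key = re.sub(pattern, replacement, new_key)
print(new_key)  # encoder.layers.0.attention.qkv.weight

# Mimic split_qkv_tensor: the fused weight is assumed to be stacked as [q; k; v] along dim 0.
fused = torch.randn(3 * 1024, 1024)
q, k, v = torch.split(fused, fused.shape[0] // 3, dim=0)
split_state = {
    new_key.replace("qkv", name): tensor
    for name, tensor in zip(("q_proj", "k_proj", "v_proj"), (q, k, v))
}
print(sorted(split_state))  # ['encoder.layers.0.attention.k_proj.weight', ...]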
-from typing import Optional, Tuple, Union + +from typing import Callable, Optional, Tuple, Union import torch from torch import nn from transformers.modeling_outputs import BaseModelOutput -from transformers.modeling_utils import PreTrainedModel +from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN +from ...utils import logging from .configuration_aimv2 import AIMv2Config -class AIMv2PreTrainedModel(PreTrainedModel): - pass +logger = logging.get_logger(__name__) class AIMv2RMSNorm(nn.Module): @@ -76,6 +77,7 @@ class AIMv2SwiGLUFFN(nn.Module): class AIMv2Embeddings(nn.Module): def __init__(self, config: AIMv2Config): + super().__init__() self.patch_embed = nn.Conv2d( config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size ) @@ -83,6 +85,7 @@ class AIMv2Embeddings(nn.Module): num_patches = (config.image_size // config.patch_size) ** 2 self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod def build_2d_sincos_position_embedding(height, width, embed_dim): @@ -90,16 +93,42 @@ class AIMv2Embeddings(nn.Module): def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) - hidden_states = self.norm(hidden_states) + hidden_states = self.rms_norm(hidden_states) _, num_patches, _ = hidden_states.size() # added logic for native in build s2d sincos pos embed - hidden_states = hidden_states + self.position_embeddings + hidden_states = hidden_states + self.position_embeddings(self.position_ids) return hidden_states +# Replace attn_mask with head mask +def eager_attention_forward( + module: nn.Module, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + + # Only apply attention dropout during training. 
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -127,6 +156,7 @@ class AIMv2Attention(nn.Module): hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -140,40 +170,34 @@ class AIMv2Attention(nn.Module): key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): - raise ValueError( - f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) - attn_weights = attn_weights + attention_mask + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scale, + is_causal=False, + **kwargs, + ) - if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) attn_output = self.proj_out(attn_output) attn_output = self.proj_drop(attn_output) - output = (attn_output, attn_weights) if output_attentions else (attn_output,) + output = (attn_output, attn_weights) if output_attentions else (attn_output, None) return output @@ -187,11 +211,11 @@ class AIMv2EncoderLayer(nn.Module): self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( - self, hidden_states: torch.Tensor, attention_mask, output_attention: Optional[bool] = False + self, hidden_states: torch.Tensor, attention_mask, output_attentions: Optional[bool] = False ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) attn_output, attn_wights = self.attention( - hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attention=output_attention + hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions ) hidden_states = hidden_states + attn_output @@ -199,7 +223,7 @@ class AIMv2EncoderLayer(nn.Module): mlp_output = self.ffn(norm_hidden_states) hidden_states = hidden_states + mlp_output - return (hidden_states, attn_wights) if output_attention else (hidden_states,) + return (hidden_states, attn_wights) if output_attentions else (hidden_states, None) class AIMv2Encoder(nn.Module): @@ -290,14 +314,47 @@ class AIMv2Encoder(nn.Module): ) -class AIMv2Model(nn.Module): +class AIMv2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = AIMv2Config + base_model_prefix = "aimv2" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["AIMv2SwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None: + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, AIMv2Embeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + +class AIMv2Model(AIMv2PreTrainedModel): def __init__(self, config: AIMv2Config): - super().__init__() + super().__init__(config) self.config = config self.embeddings = AIMv2Embeddings(config) self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + # Initialize weights and apply final processing + self.post_init() + def forward( self, pixel_values, @@ -332,4 +389,4 @@ class AIMv2Model(nn.Module): ) -__all__ = ["AIMv2Model", "AIMv2PreTrainedModel"] +__all__ = ["AIMv2Model"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index f3484ee3b34..69419cd3c9c 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -13,22 +13,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, Union, List +"""Pytorch implementation of AIMv2 Model""" + +from typing import Callable, Optional, Tuple, Union import torch from torch import nn from transformers.modeling_outputs import BaseModelOutput -from transformers.modeling_utils import PreTrainedModel +from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN +from ...utils import ( + logging, +) from ..llama.modeling_llama import LlamaRMSNorm -from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder +from ..siglip.modeling_siglip import SiglipEncoder from .configuration_aimv2 import AIMv2Config -class AIMv2PreTrainedModel(PreTrainedModel): - pass +logger = logging.get_logger(__name__) class AIMv2RMSNorm(LlamaRMSNorm): @@ -56,6 +60,7 @@ class AIMv2SwiGLUFFN(nn.Module): class AIMv2Embeddings(nn.Module): def __init__(self, config: AIMv2Config): + super().__init__() self.patch_embed = nn.Conv2d( config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size ) @@ -63,6 +68,7 @@ class AIMv2Embeddings(nn.Module): num_patches = (config.image_size // config.patch_size) ** 2 self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod def build_2d_sincos_position_embedding(height, width, embed_dim): @@ -70,20 +76,45 @@ class AIMv2Embeddings(nn.Module): def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) - hidden_states = self.norm(hidden_states) + hidden_states = self.rms_norm(hidden_states) _, num_patches, _ = hidden_states.size() # added logic for native in build s2d sincos pos embed - hidden_states = hidden_states + self.position_embeddings + hidden_states = hidden_states + self.position_embeddings(self.position_ids) return hidden_states +# Replace atttn_mask with head mask +def eager_attention_forward( + module: nn.Module, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + + # Only apply attention dropout during training. 
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config:AIMv2Config): + def __init__(self, config: AIMv2Config): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -107,6 +138,7 @@ class AIMv2Attention(nn.Module): hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" @@ -120,40 +152,34 @@ class AIMv2Attention(nn.Module): key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): - raise ValueError( - f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) - attn_weights = attn_weights + attention_mask + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scale, + is_causal=False, + **kwargs, + ) - if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) attn_output = self.proj_out(attn_output) attn_output = self.proj_drop(attn_output) - output = (attn_output, attn_weights) if output_attentions else (attn_output,) + output = (attn_output, attn_weights) if output_attentions else (attn_output, None) return output @@ -167,11 +193,11 @@ class AIMv2EncoderLayer(nn.Module): self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( - self, hidden_states: torch.Tensor, attention_mask, output_attention: Optional[bool] = False + self, hidden_states: torch.Tensor, attention_mask, output_attentions: Optional[bool] = False ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) attn_output, attn_wights = self.attention( - hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attention=output_attention + hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions ) hidden_states = hidden_states + attn_output @@ -179,7 +205,7 @@ class AIMv2EncoderLayer(nn.Module): mlp_output = self.ffn(norm_hidden_states) hidden_states = hidden_states + mlp_output - return (hidden_states, attn_wights) if output_attention else (hidden_states,) + return (hidden_states, attn_wights) if output_attentions else (hidden_states, None) class AIMv2Encoder(SiglipEncoder): @@ -215,9 +241,10 @@ class AIMv2PreTrainedModel(PreTrainedModel): std=self.config.initializer_range, ).to(module.position_embeddings.dtype) -class AIMv2Model(nn.Module): + +class AIMv2Model(AIMv2PreTrainedModel): def __init__(self, config: AIMv2Config): - super().__init__() + super().__init__(config) self.config = config self.embeddings = AIMv2Embeddings(config) self.encoder = AIMv2Encoder(config) diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index f456186054a..5b699a4a44d 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -45,7 +45,6 @@ logger = logging.get_logger(__name__) PROCESSOR_MAPPING_NAMES = OrderedDict( [ - ("aimv2", "AIMv2Processor"), ("align", "AlignProcessor"), ("altclip", "AltCLIPProcessor"), ("aria", "AriaProcessor"), diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1238cdadc35..dc70594f1fc 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -555,6 +555,7 @@ class PreTrainedModel(metaclass=DummyObject): def __init__(self, *args, 
**kwargs): requires_backends(self, ["torch"]) + class AIMv2Model(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 20b15d43efc..9e1690238c5 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,46 +12,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Testing suite for the PyTorch AIMv2 model.""" +"""Testing suite for the PyTorch ViT model.""" -import inspect -import os -import tempfile import unittest -from typing import Optional, Tuple -import numpy as np -import requests -from parameterized import parameterized -from pytest import mark - -from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig +from transformers import ViTConfig from transformers.testing_utils import ( - require_flash_attn, + require_accelerate, require_torch, - require_torch_gpu, - require_torch_sdpa, + require_torch_accelerator, + require_torch_fp16, require_vision, slow, torch_device, ) -from transformers.utils import ( - is_torch_available, - is_torch_bf16_available_on_device, - is_torch_fp16_available_on_device, - is_torch_sdpa_available, - is_vision_available, -) +from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - is_flaky, - random_attention_mask, -) +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -59,44 +37,37 @@ if is_torch_available(): import torch from torch import nn - from transformers import ( - AIMv2ForImageClassification, - AIMv2Model, - AIMv2TextModel, - AIMv2TextModelWithProjection, - AIMv2VisionModel, - AIMv2VisionModelWithProjection, - ) - - -if is_torch_sdpa_available(): - from torch.nn.attention import SDPBackend, sdpa_kernel + from transformers import AIMv2Model if is_vision_available(): from PIL import Image - - from transformers import CLIPProcessor + from transformers import AutoImageProcessor -class AIMv2VisionModelTester: +class ViTModelTester: def __init__( self, parent, - batch_size=12, + batch_size=13, image_size=30, patch_size=2, num_channels=3, is_training=True, + use_labels=True, hidden_size=32, - projection_dim=32, num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, + hidden_act="silu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, initializer_range=0.02, scope=None, + encoder_stride=2, + mask_ratio=0.5, + attn_implementation="eager", ): self.parent = parent self.batch_size = batch_size @@ -104,298 +75,135 @@ class AIMv2VisionModelTester: self.patch_size = patch_size self.num_channels = num_channels self.is_training = is_training + self.use_labels = use_labels self.hidden_size = hidden_size - self.projection_dim = projection_dim self.num_hidden_layers = num_hidden_layers self.num_attention_heads = 
num_attention_heads self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.scope = scope + self.encoder_stride = encoder_stride + self.attn_implementation = attn_implementation # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) num_patches = (image_size // patch_size) ** 2 self.seq_length = num_patches + 1 + self.mask_ratio = mask_ratio + self.num_masks = int(mask_ratio * self.seq_length) + self.mask_length = num_patches def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + config = self.get_config() - return config, pixel_values + return config, pixel_values, labels def get_config(self): - return AIMv2VisionConfig( + return ViTConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, hidden_size=self.hidden_size, - projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, initializer_range=self.initializer_range, + encoder_stride=self.encoder_stride, + attn_implementation=self.attn_implementation, ) - def create_and_check_model(self, config, pixel_values): - model = AIMv2VisionModel(config=config) + def create_and_check_model(self, config, pixel_values, labels): + model = AIMv2Model(config=config) model.to(torch_device) model.eval() - with torch.no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_model_with_projection(self, config, pixel_values): - model = AIMv2VisionModelWithProjection(config=config) + + def create_and_check_for_image_classification(self, config, pixel_values, labels): + config.num_labels = self.type_sequence_label_size + model = ViTForImageClassification(config) model.to(torch_device) model.eval() - with torch.no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - 
self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) + result = model(pixel_values, labels=labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + + # test greyscale images + config.num_channels = 1 + model = ViTForImageClassification(config) + model.to(torch_device) + model.eval() + + pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) + result = model(pixel_values) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs + ( + config, + pixel_values, + labels, + ) = config_and_inputs inputs_dict = {"pixel_values": pixel_values} return config, inputs_dict -class AIMv2ModelTesterMixin(ModelTesterMixin): - """ - Subclass of ModelTesterMixin with methods specific to testing AIMv2 models. - The SDPA equivalence test is overridden here because AIMv2 models may have test/vision/text+vision inputs, - different output logits, and are not supposed to be used or tested with padding_side="left". - """ - - def test_sdpa_can_dispatch_composite_models(self): - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # Load the model with SDPA - model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) - - # Load model with eager attention - model_eager = model_class.from_pretrained( - tmpdirname, - attn_implementation="eager", - ) - model_eager = model_eager.eval().to(torch_device) - - # SigLip has one shared cls attr for all models, so we assign both submodels heer - vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"): - self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn) - self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn) - self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") - self.assertTrue(model_eager.text_model.config._attn_implementation == "eager") - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - - def test_eager_matches_sdpa_inference( - self, - torch_dtype: str, - use_attention_mask_options: Tuple[Optional[str], ...] = (None, "left", "right"), - logit_keys: Tuple[str, ...] 
= ("logits_per_image", "logits_per_text", "image_embeds", "text_embeds"), - ): - if not self.all_model_classes[0]._supports_sdpa: - self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") - - if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): - self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") - - if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): - self.skipTest( - f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" - ) - - # Convert to torch dtype - dtypes = { - "float16": torch.float16, - "bfloat16": torch.bfloat16, - "float32": torch.float32, - } - torch_dtype = dtypes[torch_dtype] - - atols = { - torch.float32: 1e-5, - torch.bfloat16: 3e-2, - torch.float16: 5e-3, - } - rtols = { - torch.float32: 1e-4, - torch.bfloat16: 3e-2, - torch.float16: 5e-3, - } - - atol = atols[torch_dtype] - rtol = rtols[torch_dtype] - - def get_mean_reldiff(msg, current_case, x, ref, atol, rtol): - return f"{msg} {current_case}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # Load the model with SDPA - model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) - model_sdpa = model_sdpa.eval().to(torch_device) - - # Load model with eager attention - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch_dtype, - attn_implementation="eager", - ) - model_eager = model_eager.eval().to(torch_device) - - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time, - # but it would be nicer to have an efficient way to use parameterized.expand - cases = [ - (use_mask, output_attentions, sdpa_backend, batch_size) - for use_mask in use_attention_mask_options - for output_attentions in [True, False] - for sdpa_backend in [ - [SDPBackend.MATH], - [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH], - [SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH], - [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH], - ] - for batch_size in [1, 5] - ] - fail_cases = [] - - for use_mask, output_attentions, sdpa_backend, batch_size in cases: - processed_inputs = inputs_dict.copy() - - # convert to torch_dtype - if "pixel_values" in processed_inputs: - processed_inputs["pixel_values"] = processed_inputs["pixel_values"].to(torch_dtype) - - # slice for different batch sizes - for key in ["pixel_values", "input_ids", "attention_mask"]: - if key in processed_inputs: - processed_inputs[key] = processed_inputs[key][:batch_size] - - # set attention mask with left padding - if not use_mask: - processed_inputs.pop("attention_mask", None) - elif use_mask == "left": - dummy_attention_mask = processed_inputs["attention_mask"] - dummy_attention_mask[:] = 1 - dummy_attention_mask[:, :1] = 0 - processed_inputs["attention_mask"] = dummy_attention_mask - elif use_mask == "right": - dummy_attention_mask = processed_inputs["attention_mask"] - dummy_attention_mask[:] = 1 - dummy_attention_mask[:, -1:] = 0 - processed_inputs["attention_mask"] = dummy_attention_mask - else: - raise ValueError(f"Invalid value for 
use_mask={use_mask}") - - processed_inputs["output_attentions"] = output_attentions - processed_inputs["output_hidden_states"] = True - - current_case = f"use_mask={use_mask}, batch_size={batch_size}, sdpa_backend={sdpa_backend}" - - prepared_inputs = self._prepare_for_class(processed_inputs, model_class) - - with torch.no_grad(): - try: - with sdpa_kernel(sdpa_backend): - outputs_eager = model_eager(**prepared_inputs) - outputs_sdpa = model_sdpa(**prepared_inputs) - except Exception as e: - fail_cases.append(f"{current_case}: {e}") - continue - - keys = set(logit_keys) & set(outputs_eager.keys()) - self.assertTrue( - keys, f"Keys {logit_keys} not found in outputs. Available keys: {outputs_eager.keys()}" - ) - - for key in keys: - try: - eager_logits = outputs_eager[key] - sdpa_logits = outputs_sdpa[key] - except KeyError: - raise KeyError(f"Key {key} not found in outputs. Available keys: {outputs_eager.keys()}") - - if "hidden_state" in key and use_mask == "left": - eager_logits = eager_logits[:, 1:] - sdpa_logits = sdpa_logits[:, 1:] - elif "hidden_state" in key and use_mask == "right": - eager_logits = eager_logits[:, :-1] - sdpa_logits = sdpa_logits[:, :-1] - - is_close = torch.allclose(eager_logits, sdpa_logits, atol=atol, rtol=rtol) - if not is_close: - fail_cases.append(get_mean_reldiff(key, current_case, sdpa_logits, eager_logits, atol, rtol)) - - self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - - @require_torch -class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): +class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as AIMv2 does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, attention_mask and seq_length. """ - all_model_classes = (AIMv2VisionModel, AIMv2VisionModelWithProjection) if is_torch_available() else () - fx_compatible = False + all_model_classes = ( + ( + AIMv2Model, + ViTForImageClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + {"image-feature-extraction": AIMv2Model, "image-classification": ViTForImageClassification} + if is_torch_available() + else {} + ) + fx_compatible = True + test_pruning = False test_resize_embeddings = False test_head_masking = False + test_torch_exportable = True def setUp(self): - self.model_tester = AIMv2VisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=AIMv2VisionConfig, has_text_modality=False, hidden_size=37 - ) + self.model_tester = ViTModelTester(self) + self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37) + + @unittest.skip( + "Since `torch==2.3+cu121`, although this test passes, many subsequent tests have `CUDA error: misaligned address`." + "If `nvidia-xxx-cu118` are also installed, no failure (even with `torch==2.3+cu121`)." 
+ ) + def test_multi_gpu_data_parallel_forward(self): + super().test_multi_gpu_data_parallel_forward() def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="AIMv2 does not use inputs_embeds") + @unittest.skip(reason="ViT does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -408,737 +216,97 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): x = model.get_output_embeddings() self.assertTrue(x is None or isinstance(x, nn.Linear)) - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_with_projection(self): + + def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="AIMv2VisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="AIMv2VisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass + self.model_tester.create_and_check_for_image_classification(*config_and_inputs) @slow def test_model_from_pretrained(self): - model_name = "apple/aimv2-large-patch14-224" - model = AIMv2VisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "apple/aimv2-large-patch14-224" - model = AIMv2VisionModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "visual_projection")) - - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - super().test_eager_matches_sdpa_inference( - torch_dtype=torch_dtype, - logit_keys=("last_hidden_state", "pooler_output", "image_embeds"), - use_attention_mask_options=(None,), - ) - - @require_torch_sdpa - def test_sdpa_can_dispatch_composite_models(self): - super().test_sdpa_can_dispatch_composite_models() - - -class AIMv2TextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - 
intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :start_index] = 1 - input_mask[batch_idx, start_index:] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return AIMv2TextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = AIMv2TextModel(config=config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_projection(self, config, input_ids, input_mask): - model = AIMv2TextModelWithProjection(config=config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_torch -class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): - all_model_classes = (AIMv2TextModel, AIMv2TextModelWithProjection) if is_torch_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = AIMv2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=AIMv2TextConfig, hidden_size=37) - - def test_config(self): - 
self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_projection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="AIMv2 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="AIMv2TextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="AIMv2TextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "apple/aimv2-large-patch14-224" - model = AIMv2TextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "apple/aimv2-large-patch14-224" - model = AIMv2TextModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "text_projection")) - - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - super().test_eager_matches_sdpa_inference( - torch_dtype=torch_dtype, - logit_keys=("last_hidden_state", "pooler_output", "text_embeds"), - use_attention_mask_options=(None, "right"), # "left" is not supported for text model - ) - - @require_torch_sdpa - def test_sdpa_can_dispatch_composite_models(self): - super().test_sdpa_can_dispatch_composite_models() - - @require_torch_sdpa - def test_sdpa_can_dispatch_on_flash(self): - self.skipTest(reason="AIMv2TextModel has two attention masks: `causal_attention_mask` and `attention_mask`") - - -class AIMv2ModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = AIMv2TextModelTester(parent, **text_kwargs) - self.vision_model_tester = AIMv2VisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return AIMv2Config.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), 
projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = AIMv2Model(config).to(torch_device).eval() - with torch.no_grad(): - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_torch -class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (AIMv2Model,) if is_torch_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - _is_composite = True - - def setUp(self): - self.model_tester = AIMv2ModelTester(self) - common_properties = ["projection_dim", "logit_scale_init_value"] - self.config_tester = ConfigTester( - self, config_class=AIMv2Config, has_text_modality=False, common_properties=common_properties - ) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="AIMv2Model does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for AIMv2 - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def _create_and_check_torchscript(self, config, inputs_dict): - if not self.test_torchscript: - self.skipTest(reason="test_torchscript is set to False") - - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.torchscript = True - configs_no_init.return_dict = False - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - - try: - input_ids = inputs_dict["input_ids"] - 
pixel_values = inputs_dict["pixel_values"] # AIMv2 needs pixel_values - traced_model = torch.jit.trace(model, (input_ids, pixel_values)) - except RuntimeError: - self.fail("Couldn't trace module.") - - with tempfile.TemporaryDirectory() as tmp_dir_name: - pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") - - try: - torch.jit.save(traced_model, pt_file_name) - except Exception: - self.fail("Couldn't save module.") - - try: - loaded_model = torch.jit.load(pt_file_name) - except Exception: - self.fail("Couldn't load module.") - - model.to(torch_device) - model.eval() - - loaded_model.to(torch_device) - loaded_model.eval() - - model_state_dict = model.state_dict() - loaded_model_state_dict = loaded_model.state_dict() - - non_persistent_buffers = {} - for key in loaded_model_state_dict.keys(): - if key not in model_state_dict.keys(): - non_persistent_buffers[key] = loaded_model_state_dict[key] - - loaded_model_state_dict = { - key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers - } - - self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) - - model_buffers = list(model.buffers()) - for non_persistent_buffer in non_persistent_buffers.values(): - found_buffer = False - for i, model_buffer in enumerate(model_buffers): - if torch.equal(non_persistent_buffer, model_buffer): - found_buffer = True - break - - self.assertTrue(found_buffer) - model_buffers.pop(i) - - models_equal = True - for layer_name, p1 in model_state_dict.items(): - p2 = loaded_model_state_dict[layer_name] - if p1.data.ne(p2.data).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save AIMv2Config and check if we can load AIMv2VisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = AIMv2VisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save AIMv2Config and check if we can load AIMv2TextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = AIMv2TextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "apple/aimv2-large-patch14-224" + model_name = "google/vit-base-patch16-224" model = AIMv2Model.from_pretrained(model_name) self.assertIsNotNone(model) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - super().test_eager_matches_sdpa_inference( - torch_dtype=torch_dtype, - logit_keys=("logits_per_image", "logits_per_text"), - use_attention_mask_options=(None, "right"), # "left" is not supported for text model - ) - - @require_torch_sdpa - def test_sdpa_can_dispatch_composite_models(self): - super().test_sdpa_can_dispatch_composite_models() - - @require_torch_sdpa - def test_sdpa_can_dispatch_on_flash(self): - self.skipTest(reason="AIMv2 text tower has two attention masks: `causal_attention_mask` and `attention_mask`") - - @require_torch_sdpa - def test_sdpa_can_compile_dynamic(self): - self.skipTest(reason="AIMv2 model can't be compiled dynamic, error in aimv2_loss`") - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test 
- @slow - def test_flash_attn_2_inference_equivalence(self): - for model_class in self.all_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - ) - model_fa.to(torch_device) - - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) - model.to(torch_device) - - dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) - dummy_input_ids = inputs_dict["input_ids"] - - outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) - outputs_fa = model_fa( - pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True - ) - - self.assertTrue( - torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2), - f"Image logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}", - ) - self.assertTrue( - torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2), - f"Text logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}", - ) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - def test_flash_attn_2_inference_equivalence_right_padding(self): - for model_class in self.all_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_fa = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - ) - model_fa.to(torch_device) - - model = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="eager" - ) - model.to(torch_device) - - dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) - dummy_input_ids = inputs_dict["input_ids"] - dummy_pixel_mask = inputs_dict["attention_mask"] - - # right padding - dummy_pixel_mask[:] = 1 - dummy_pixel_mask[:, -1:] = 0 - - outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) - outputs_fa = model_fa( - pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True - ) - - logits_per_image_eager = outputs.logits_per_image[:, :-1] - logits_per_text_eager = outputs.logits_per_text[:, :-1] - - logits_per_image_sdpa = outputs_fa.logits_per_image[:, :-1] - logits_per_text_sdpa = outputs_fa.logits_per_text[:, :-1] - - self.assertTrue( - torch.allclose(logits_per_image_eager, logits_per_image_sdpa, atol=4e-2, rtol=4e-2), - f"Image logits max diff: {torch.max(torch.abs(logits_per_image_eager - logits_per_image_sdpa))}", - ) - self.assertTrue( - torch.allclose(logits_per_text_eager, logits_per_text_sdpa, atol=4e-2, rtol=4e-2), - f"Text logits max diff: {torch.max(torch.abs(logits_per_text_eager - logits_per_text_sdpa))}", - ) - - -class AIMv2ForImageClassificationModelTester(AIMv2ModelTester): - def __init__(self, parent): - super().__init__(parent) - 
self.batch_size = self.vision_model_tester.batch_size - self.num_hidden_layers = self.vision_model_tester.num_hidden_layers - self.hidden_size = self.vision_model_tester.hidden_size - self.seq_length = self.vision_model_tester.seq_length - - def prepare_config_and_inputs(self): - _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - config = self.get_config() - - return config, pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_torch -class AIMv2ForImageClassificationModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (AIMv2ForImageClassification,) if is_torch_available() else () - pipeline_model_mapping = {"image-classification": AIMv2ForImageClassification} if is_torch_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - _is_composite = True - - def setUp(self): - self.model_tester = AIMv2ForImageClassificationModelTester(self) - - @unittest.skip(reason="AIMv2ForImageClassification does not support inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="AIMv2ForImageClassification does not support inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="AIMv2ForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="AIMv2ForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="AIMv2ForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="AIMv2 uses the same initialization scheme as the Flax original implementation") - def test_initialization(self): - pass - - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - super().test_eager_matches_sdpa_inference( - torch_dtype=torch_dtype, - logit_keys=("logits",), - use_attention_mask_options=(None,), - ) - - @require_torch_sdpa - def test_sdpa_can_dispatch_composite_models(self): - super().test_sdpa_can_dispatch_composite_models() - # We will verify our results on an image of cute cats def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image -@require_vision @require_torch -class AIMv2ModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "apple/aimv2-large-patch14-224" - model = AIMv2Model.from_pretrained(model_name).to(torch_device) - processor = CLIPProcessor.from_pretrained(model_name) +@require_vision +class ViTModelIntegrationTest(unittest.TestCase): + @cached_property + def default_image_processor(self): + return AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None + @slow + def test_inference_image_classification_head(self): + model = 
ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device) + + image_processor = self.default_image_processor image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" - ).to(torch_device) + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) # forward pass with torch.no_grad(): outputs = model(**inputs) # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), - ) - self.assertEqual( - outputs.logits_per_text.shape, - torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), - ) + expected_shape = torch.Size((1, 1000)) + self.assertEqual(outputs.logits.shape, expected_shape) - expected_logits = torch.tensor([[24.5701, 19.3049]], device=torch_device) + expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) - torch.testing.assert_close(outputs.logits_per_image, expected_logits, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) @slow def test_inference_interpolate_pos_encoding(self): - # AIMv2 models have an `interpolate_pos_encoding` argument in their forward method, + # ViT models have an `interpolate_pos_encoding` argument in their forward method, # allowing to interpolate the pre-trained position embeddings in order to use # the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. - model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224").to(torch_device) + model = AIMv2Model.from_pretrained("facebook/dino-vits8").to(torch_device) - processor = CLIPProcessor.from_pretrained( - "apple/aimv2-large-patch14-224", - size={"height": 180, "width": 180}, - crop_size={"height": 180, "width": 180}, - ) - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = processor(text="what's in the image", images=image, return_tensors="pt").to(torch_device) - - # interpolate_pos_encodiung false should return value error - with self.assertRaises(ValueError, msg="doesn't match model"): - with torch.no_grad(): - model(**inputs, interpolate_pos_encoding=False) + image_processor = AutoImageProcessor.from_pretrained("facebook/dino-vits8", size=480) + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) # forward pass with torch.no_grad(): - outputs = model(**inputs, interpolate_pos_encoding=True) + outputs = model(pixel_values, interpolate_pos_encoding=True) # verify the logits - expected_shape = torch.Size((1, 26, 768)) - - self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) + expected_shape = torch.Size((1, 3601, 384)) + self.assertEqual(outputs.last_hidden_state.shape, expected_shape) expected_slice = torch.tensor( - [[-0.1538, 0.0322, -0.3235], [0.2893, 0.1135, -0.5708], [0.0461, 0.1540, -0.6018]] + [[4.2325, 4.3882, -6.6678], [4.5372, 1.8933, -6.7355], [4.4454, 0.8514, -5.8747]] ).to(torch_device) - torch.testing.assert_close( - outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-4, atol=1e-4 - ) + torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-3, atol=1e-3) + + @slow + @require_accelerate + @require_torch_accelerator + @require_torch_fp16 + def test_inference_fp16(self): + r""" + A 
small test to make sure that inference work in half precision without any problem. + """ + model = AIMv2Model.from_pretrained("facebook/dino-vits8", torch_dtype=torch.float16, device_map="auto") + image_processor = self.default_image_processor + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + pixel_values = inputs.pixel_values.to(torch_device) + + # forward pass to make sure inference works in fp16 + with torch.no_grad(): + _ = model(pixel_values) From daac33803aeaf5477b7590774aa4f118f1df7dce Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Mon, 10 Mar 2025 11:38:33 +0530 Subject: [PATCH 05/62] Added support for aimv2-native --- .../convert_aimv2_original_pytorch_to_hf.py | 14 +++-- .../models/aimv2/modeling_aimv2.py | 52 ++++++++++++------ .../models/aimv2/modular_aimv2.py | 53 +++++++++++++------ tests/models/aimv2/test_modeling_aimv2.py | 1 + 4 files changed, 81 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 63c6fad70d7..828e9e6f579 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -100,7 +100,7 @@ def write_model( config = AIMv2Config.from_pretrained(hf_repo_id) # Load original model state dict - original_state_dict = load_original_state_dict("apple/aimv2-large-patch14-224") + original_state_dict = load_original_state_dict(hf_repo_id) print("Converting model...") state_dict = {} @@ -117,13 +117,17 @@ def write_model( else: state_dict[new_key] = value - state_dict["embeddings.position_embeddings.weight"] = state_dict["embeddings.position_embeddings.weight"].squeeze( - 0 - ) + # Check if position embeddings exist before squeezing + if "embeddings.position_embeddings.weight" in state_dict: + state_dict["embeddings.position_embeddings.weight"] = state_dict["embeddings.position_embeddings.weight"].squeeze(0) + strict_loading = True + else: + # For `apple/aimv2-large-patch14-native` we don't have position_embeddings in state_dict + strict_loading = False print("Loading the checkpoint in a DepthPro model.") model = AIMv2Model(config) - model.load_state_dict(state_dict, strict=True, assign=True) + model.load_state_dict(state_dict, strict=strict_loading, assign=True) print("Checkpoint loaded successfully.") print("Saving the model.") diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 2fbf5ec2ac5..839c6b58489 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -78,6 +78,8 @@ class AIMv2SwiGLUFFN(nn.Module): class AIMv2Embeddings(nn.Module): def __init__(self, config: AIMv2Config): super().__init__() + self.config = config + self.patch_size = config.patch_size self.patch_embed = nn.Conv2d( config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size ) @@ -88,36 +90,52 @@ class AIMv2Embeddings(nn.Module): self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod - def build_2d_sincos_position_embedding(height, width, embed_dim): - pass + def build_2d_sincos_position_embedding( + height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 + ): + grid_w = torch.arange(int(width), dtype=dtype, device=device) + grid_h = torch.arange(int(height), dtype=dtype, 
device=device) + grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy") + + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim + omega = 1.0 / (temperature**omega) + + out_h = grid_h.flatten()[..., None] @ omega[None, :] + out_w = grid_w.flatten()[..., None] @ omega[None, :] + + return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :] def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + _, _, height, width = pixel_values.size() hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) hidden_states = self.rms_norm(hidden_states) - _, num_patches, _ = hidden_states.size() - - # added logic for native in build s2d sincos pos embed - hidden_states = hidden_states + self.position_embeddings(self.position_ids) + if self.config.image_size != height or self.config.image_size != width: + pos_embed = self.build_2d_sincos_position_embedding( + height // self.patch_size, width // self.patch_size, embed_dim=self.config.hidden_size + ) + else: + pos_embed = self.position_embeddings(self.position_ids) + hidden_states = hidden_states + pos_embed return hidden_states -# Replace attn_mask with head mask def eager_attention_forward( module: nn.Module, query_states: torch.Tensor, key_states: torch.Tensor, value_states: torch.Tensor, - attention_mask: Optional[torch.Tensor], + head_mask: Optional[torch.Tensor], scaling: float, dropout: float = 0.0, **kwargs, ): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling - if attention_mask is not None: - attn_weights = attn_weights + attention_mask + if head_mask is not None: + attn_weights = attn_weights + head_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -154,7 +172,7 @@ class AIMv2Attention(nn.Module): def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -185,7 +203,7 @@ class AIMv2Attention(nn.Module): query_states, key_states, value_states, - attention_mask, + head_mask, dropout=0.0 if not self.training else self.attention_dropout, scaling=self.scale, is_causal=False, @@ -211,11 +229,11 @@ class AIMv2EncoderLayer(nn.Module): self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( - self, hidden_states: torch.Tensor, attention_mask, output_attentions: Optional[bool] = False + self, hidden_states: torch.Tensor, head_mask, output_attentions: Optional[bool] = False ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) attn_output, attn_wights = self.attention( - hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + hidden_states=norm_hidden_states, head_mask=head_mask, output_attentions=output_attentions ) hidden_states = hidden_states + attn_output @@ -353,12 +371,12 @@ class AIMv2Model(AIMv2PreTrainedModel): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) # Initialize weights and apply final processing - self.post_init() + # self.post_init() def forward( self, pixel_values, - attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -373,7 +391,7 @@ class 
AIMv2Model(AIMv2PreTrainedModel): encoder_outputs = self.encoder( inputs_embeds=hidden_states, - attention_mask=attention_mask, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 69419cd3c9c..f3928e6fca9 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -61,6 +61,8 @@ class AIMv2SwiGLUFFN(nn.Module): class AIMv2Embeddings(nn.Module): def __init__(self, config: AIMv2Config): super().__init__() + self.config = config + self.patch_size = config.patch_size self.patch_embed = nn.Conv2d( config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size ) @@ -71,35 +73,52 @@ class AIMv2Embeddings(nn.Module): self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod - def build_2d_sincos_position_embedding(height, width, embed_dim): - pass + def build_2d_sincos_position_embedding( + height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 + ): + grid_w = torch.arange(int(width), dtype=dtype, device=device) + grid_h = torch.arange(int(height), dtype=dtype, device=device) + grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy") + + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim + omega = 1.0 / (temperature**omega) + + out_h = grid_h.flatten()[..., None] @ omega[None, :] + out_w = grid_w.flatten()[..., None] @ omega[None, :] + + return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :] def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + _, _, height, width = pixel_values.size() hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) hidden_states = self.rms_norm(hidden_states) - _, num_patches, _ = hidden_states.size() - - # added logic for native in build s2d sincos pos embed - hidden_states = hidden_states + self.position_embeddings(self.position_ids) + if self.config.image_size != height or self.config.image_size != width: + pos_embed = self.build_2d_sincos_position_embedding( + height // self.patch_size, width // self.patch_size, embed_dim=self.config.hidden_size + ) + else: + pos_embed = self.position_embeddings(self.position_ids) + hidden_states = hidden_states + pos_embed return hidden_states -# Replace atttn_mask with head mask + def eager_attention_forward( module: nn.Module, query_states: torch.Tensor, key_states: torch.Tensor, value_states: torch.Tensor, - attention_mask: Optional[torch.Tensor], + head_mask: Optional[torch.Tensor], scaling: float, dropout: float = 0.0, **kwargs, ): attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling - if attention_mask is not None: - attn_weights = attn_weights + attention_mask + if head_mask is not None: + attn_weights = attn_weights + head_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -136,7 +155,7 @@ class AIMv2Attention(nn.Module): def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -167,7 +186,7 @@ class AIMv2Attention(nn.Module): query_states, key_states, value_states, - 
attention_mask, + head_mask, dropout=0.0 if not self.training else self.attention_dropout, scaling=self.scale, is_causal=False, @@ -193,11 +212,11 @@ class AIMv2EncoderLayer(nn.Module): self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( - self, hidden_states: torch.Tensor, attention_mask, output_attentions: Optional[bool] = False + self, hidden_states: torch.Tensor, head_mask, output_attentions: Optional[bool] = False ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) attn_output, attn_wights = self.attention( - hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + hidden_states=norm_hidden_states, head_mask=head_mask, output_attentions=output_attentions ) hidden_states = hidden_states + attn_output @@ -251,12 +270,12 @@ class AIMv2Model(AIMv2PreTrainedModel): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) # Initialize weights and apply final processing - self.post_init() + # self.post_init() def forward( self, pixel_values, - attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -271,7 +290,7 @@ class AIMv2Model(AIMv2PreTrainedModel): encoder_outputs = self.encoder( inputs_embeds=hidden_states, - attention_mask=attention_mask, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 9e1690238c5..7ff0fd4f3c5 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -42,6 +42,7 @@ if is_torch_available(): if is_vision_available(): from PIL import Image + from transformers import AutoImageProcessor From 20f43aab77d1be2d81efc42b8112de7f732dd4c3 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Mon, 10 Mar 2025 13:09:25 +0530 Subject: [PATCH 06/62] More changes --- docs/source/en/model_doc/aimv2.md | 23 --- src/transformers/__init__.py | 4 - .../models/aimv2/configuration_aimv2.py | 95 ++++++++-- .../models/aimv2/modeling_aimv2.py | 169 ++++++++++++++++-- .../models/aimv2/modular_aimv2.py | 62 ++++++- 5 files changed, 289 insertions(+), 64 deletions(-) diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md index 1c33a06c927..917e6d8d816 100644 --- a/docs/source/en/model_doc/aimv2.md +++ b/docs/source/en/model_doc/aimv2.md @@ -40,38 +40,15 @@ The original code can be found [here](). 
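A minimal usage sketch for the vision backbone added in this PR. The checkpoint id (`apple/aimv2-large-patch14-224`) and the use of `AutoImageProcessor` are assumptions taken from the conversion script and tests elsewhere in this patch series, not a confirmed public API; the model is expected to return patch-level features in `last_hidden_state`.

```python
import requests
import torch
from PIL import Image

from transformers import AIMv2Model, AutoImageProcessor

# Checkpoint id assumed from the conversion script and tests in this PR.
checkpoint = "apple/aimv2-large-patch14-224"

processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AIMv2Model.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# (batch_size, num_patches, hidden_size) patch embeddings
print(outputs.last_hidden_state.shape)
```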
## AIMv2TextConfig -[[autodoc]] AIMv2TextConfig - -## AIMv2VisionConfig - -[[autodoc]] AIMv2VisionConfig - -## AIMv2Model [[autodoc]] AIMv2Model - forward - - get_text_features - - get_image_features ## AIMv2TextModel [[autodoc]] AIMv2TextModel - forward -## AIMv2TextModelWithProjection - -[[autodoc]] AIMv2TextModelWithProjection - - forward - -## AIMv2VisionModelWithProjection - -[[autodoc]] AIMv2VisionModelWithProjection - - forward - -## AIMv2VisionModel - -[[autodoc]] AIMv2VisionModel - - forward ## AIMv2ForImageClassification diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4a3a5c3e1db..18b89cd8f66 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1862,10 +1862,6 @@ else: "AIMv2ForImageClassification", "AIMv2Model", "AIMv2PreTrainedModel", - "AIMv2TextModel", - "AIMv2TextModelWithProjection", - "AIMv2VisionModel", - "AIMv2VisionModelWithProjection", ] ) _import_structure["models.clipseg"].extend( diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 11c7c024823..cd1c7fe30f5 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -1,5 +1,11 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aimv2/modular_aimv2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aimv2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 Apple Inc. and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,17 +18,69 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""AIMv2 model configuration""" + from ...configuration_utils import PretrainedConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) class AIMv2Config(PretrainedConfig): - model_type: str = "aimv2" + r""" + This is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to instantiate an AIMv2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the AIMv2 + [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + encoder_stride (`int`, *optional*, defaults to 16): + Factor to increase the spatial resolution by in the decoder head for masked image modeling. + + Example: + + ```python + >>> from transformers import AIMv2Config, AIMv2Model + + >>> # Initializing a AIMv2 aimv2-base-patch16-224 style configuration + >>> configuration = AIMv2Config() + + >>> # Initializing a model (with random weights) from the aimv2-base-patch16-224 style configuration + >>> model = AIMv2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "aimv2" def __init__( self, @@ -36,28 +94,29 @@ class AIMv2Config(PretrainedConfig): rms_norm_eps: float = 1e-5, attention_dropout: float = 0.0, projection_dropout: float = 0.0, - attention_bias: bool = False, + qkv_bias: bool = False, use_bias: bool = False, hidden_act="silu", initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) + self.hidden_size = hidden_size - self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.rms_norm_eps = rms_norm_eps - - self.projection_dropout = projection_dropout - self.attention_bias = attention_bias - self.use_bias = use_bias + self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.initializer_range = initializer_range + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + self.projection_dropout = projection_dropout + self.use_bias = use_bias __all__ = ["AIMv2Config"] diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 839c6b58489..2f478192556 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -24,17 +24,27 @@ from typing import Callable, Optional, Tuple, Union import torch from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from 
...activations import ACT2FN -from ...utils import logging +from ...modeling_outputs import ImageClassifierOutput +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) from .configuration_aimv2 import AIMv2Config logger = logging.get_logger(__name__) +# General docstring +_CONFIG_FOR_DOC = "AIMv2Config" + class AIMv2RMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): @@ -163,10 +173,10 @@ class AIMv2Attention(nn.Module): ) self.scale = self.head_dim**-0.5 - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) - self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.proj_drop = nn.Dropout(config.projection_dropout) def forward( @@ -355,11 +365,11 @@ class AIMv2PreTrainedModel(PreTrainedModel): if module.bias is not None: module.bias.data.zero_() elif isinstance(module, AIMv2Embeddings): - module.position_embeddings.data = nn.init.trunc_normal_( - module.position_embeddings.data.to(torch.float32), + module.position_embeddings = nn.init.trunc_normal_( + module.position_embeddings.weight.to(torch.float32), mean=0.0, std=self.config.initializer_range, - ).to(module.position_embeddings.dtype) + ).to(module.position_embeddings.weight.dtype) class AIMv2Model(AIMv2PreTrainedModel): @@ -371,7 +381,7 @@ class AIMv2Model(AIMv2PreTrainedModel): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) # Initialize weights and apply final processing - # self.post_init() + self.post_init() def forward( self, @@ -407,4 +417,143 @@ class AIMv2Model(AIMv2PreTrainedModel): ) -__all__ = ["AIMv2Model"] +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "facebook/aimv2-small-imagenet1k-1-layer" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +AIMV2_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +AIMV2_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BitImageProcessor.preprocess`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + """ + AIMv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state + of the [CLS] token) e.g. for ImageNet. + """, + AIMV2_START_DOCSTRING, +) +class AIMv2ForImageClassification(AIMv2PreTrainedModel): + def __init__(self, config: AIMv2Config) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.aimv2 = AIMv2Model(config) + + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(AIMV2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.aimv2( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] # batch_size, sequence_length, hidden_size + + cls_token = sequence_output[:, 0] + patch_tokens = sequence_output[:, 1:] + + linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + + logits = self.classifier(linear_input) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["AIMv2Model", "AIMv2ForImageClassification"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index f3928e6fca9..7108b81b747 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -27,8 +27,10 @@ from ...activations import ACT2FN from ...utils import ( logging, ) +from ..dinov2.modeling_dinov2 import Dinov2ForImageClassification from ..llama.modeling_llama import LlamaRMSNorm from ..siglip.modeling_siglip import SiglipEncoder +from ..vit.configuration_vit import ViTConfig from .configuration_aimv2 import AIMv2Config @@ -146,10 +148,10 @@ class AIMv2Attention(nn.Module): ) self.scale = self.head_dim**-0.5 - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) - self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.proj_out = nn.Linear(self.embed_dim,self.embed_dim,bias=config.qkv_bias) self.proj_drop = nn.Dropout(config.projection_dropout) def forward( @@ -254,11 +256,11 @@ class AIMv2PreTrainedModel(PreTrainedModel): if module.bias is not None: module.bias.data.zero_() elif isinstance(module, AIMv2Embeddings): - module.position_embeddings.data = nn.init.trunc_normal_( - module.position_embeddings.data.to(torch.float32), + module.position_embeddings = 
nn.init.trunc_normal_( + module.position_embeddings.weight.to(torch.float32), mean=0.0, std=self.config.initializer_range, - ).to(module.position_embeddings.dtype) + ).to(module.position_embeddings.weight.dtype) class AIMv2Model(AIMv2PreTrainedModel): @@ -270,7 +272,7 @@ class AIMv2Model(AIMv2PreTrainedModel): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) # Initialize weights and apply final processing - # self.post_init() + self.post_init() def forward( self, @@ -305,5 +307,47 @@ class AIMv2Model(AIMv2PreTrainedModel): attentions=encoder_outputs.attentions, ) +class AIMv2ForImageClassification(Dinov2ForImageClassification): + pass -__all__ = ["AIMv2Model"] + +class AIMv2Config(ViTConfig): + def __init__(self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + hidden_act="silu", + initializer_range=0.02, + **kwargs,): + super().__init__(hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_act=hidden_act, + num_channels=num_channels, + image_size=image_size, + patch_size=patch_size, + qkv_bias=qkv_bias, + initializer_range=initializer_range, + **kwargs,) + + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + self.projection_dropout = projection_dropout + self.use_bias = use_bias + + del self.attention_probs_dropout_prob + del self.layer_norm_eps + del self.encoder_stride + del self.hidden_dropout_prob + +__all__ = ["AIMv2Config","AIMv2Model","AIMv2ForImageClassification"] From 4a0b44244450c93d25a9e19774a299f2969e943e Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Mon, 10 Mar 2025 21:51:08 +0530 Subject: [PATCH 07/62] More changes --- .../convert_aimv2_original_pytorch_to_hf.py | 4 +- .../models/aimv2/modeling_aimv2.py | 25 ++- .../models/aimv2/modular_aimv2.py | 63 +++--- tests/models/aimv2/test_modeling_aimv2.py | 199 +++++++----------- 4 files changed, 120 insertions(+), 171 deletions(-) diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 828e9e6f579..58f96fb906e 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -119,7 +119,9 @@ def write_model( # Check if position embeddings exist before squeezing if "embeddings.position_embeddings.weight" in state_dict: - state_dict["embeddings.position_embeddings.weight"] = state_dict["embeddings.position_embeddings.weight"].squeeze(0) + state_dict["embeddings.position_embeddings.weight"] = state_dict[ + "embeddings.position_embeddings.weight" + ].squeeze(0) strict_loading = True else: # For `apple/aimv2-large-patch14-native` we don't have position_embeddings in state_dict diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 2f478192556..44bb7b7200d 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -165,6 +165,7 @@ class AIMv2Attention(nn.Module): self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads + 
self.attention_dropout = config.attention_dropout self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( @@ -239,7 +240,10 @@ class AIMv2EncoderLayer(nn.Module): self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( - self, hidden_states: torch.Tensor, head_mask, output_attentions: Optional[bool] = False + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) attn_output, attn_wights = self.attention( @@ -355,21 +359,16 @@ class AIMv2PreTrainedModel(PreTrainedModel): _no_split_modules = ["AIMv2SwiGLUFFN"] _supports_sdpa = True - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None: + def _init_weights(self, module): + std = self.config.initializer_range if isinstance(module, (nn.Linear, nn.Conv2d)): - # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid - # `trunc_normal_cpu` not implemented in `half` issues - module.weight.data = nn.init.trunc_normal_( - module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range - ).to(module.weight.dtype) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, AIMv2Embeddings): - module.position_embeddings = nn.init.trunc_normal_( - module.position_embeddings.weight.to(torch.float32), - mean=0.0, - std=self.config.initializer_range, - ).to(module.position_embeddings.weight.dtype) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() class AIMv2Model(AIMv2PreTrainedModel): diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 7108b81b747..9be829feeb6 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -15,7 +15,7 @@ """Pytorch implementation of AIMv2 Model""" -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Tuple import torch from torch import nn @@ -140,6 +140,7 @@ class AIMv2Attention(nn.Module): self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads + self.attention_dropout = config.attention_dropout self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( @@ -151,7 +152,7 @@ class AIMv2Attention(nn.Module): self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) - self.proj_out = nn.Linear(self.embed_dim,self.embed_dim,bias=config.qkv_bias) + self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.proj_drop = nn.Dropout(config.projection_dropout) def forward( @@ -214,7 +215,10 @@ class AIMv2EncoderLayer(nn.Module): self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( - self, hidden_states: torch.Tensor, head_mask, output_attentions: Optional[bool] = False + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = 
self.rms_norm1(hidden_states) attn_output, attn_wights = self.attention( @@ -246,21 +250,16 @@ class AIMv2PreTrainedModel(PreTrainedModel): _no_split_modules = ["AIMv2SwiGLUFFN"] _supports_sdpa = True - def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None: + def _init_weights(self, module): + std = self.config.initializer_range if isinstance(module, (nn.Linear, nn.Conv2d)): - # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid - # `trunc_normal_cpu` not implemented in `half` issues - module.weight.data = nn.init.trunc_normal_( - module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range - ).to(module.weight.dtype) + module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, AIMv2Embeddings): - module.position_embeddings = nn.init.trunc_normal_( - module.position_embeddings.weight.to(torch.float32), - mean=0.0, - std=self.config.initializer_range, - ).to(module.position_embeddings.weight.dtype) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() class AIMv2Model(AIMv2PreTrainedModel): @@ -307,12 +306,14 @@ class AIMv2Model(AIMv2PreTrainedModel): attentions=encoder_outputs.attentions, ) + class AIMv2ForImageClassification(Dinov2ForImageClassification): pass class AIMv2Config(ViTConfig): - def __init__(self, + def __init__( + self, hidden_size: int = 1024, intermediate_size: int = 2816, num_hidden_layers: int = 24, @@ -327,18 +328,21 @@ class AIMv2Config(ViTConfig): use_bias: bool = False, hidden_act="silu", initializer_range=0.02, - **kwargs,): - super().__init__(hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - hidden_act=hidden_act, - num_channels=num_channels, - image_size=image_size, - patch_size=patch_size, - qkv_bias=qkv_bias, - initializer_range=initializer_range, - **kwargs,) + **kwargs, + ): + super().__init__( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_act=hidden_act, + num_channels=num_channels, + image_size=image_size, + patch_size=patch_size, + qkv_bias=qkv_bias, + initializer_range=initializer_range, + **kwargs, + ) self.attention_dropout = attention_dropout self.rms_norm_eps = rms_norm_eps @@ -350,4 +354,5 @@ class AIMv2Config(ViTConfig): del self.encoder_stride del self.hidden_dropout_prob -__all__ = ["AIMv2Config","AIMv2Model","AIMv2ForImageClassification"] + +__all__ = ["AIMv2Config", "AIMv2Model", "AIMv2ForImageClassification"] diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 7ff0fd4f3c5..8701838d635 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Testing suite for the PyTorch ViT model.""" +"""Testing suite for the PyTorch AIMv2 model.""" import unittest -from transformers import ViTConfig +from transformers import AIMv2Config from transformers.testing_utils import ( - require_accelerate, + is_flaky, require_torch, - require_torch_accelerator, - require_torch_fp16, - require_vision, - slow, torch_device, ) -from transformers.utils import cached_property, is_torch_available, is_vision_available +from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor @@ -34,19 +30,16 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): - import torch from torch import nn - from transformers import AIMv2Model + from transformers import AIMv2ForImageClassification, AIMv2Model if is_vision_available(): - from PIL import Image - - from transformers import AutoImageProcessor + pass -class ViTModelTester: +class AIMv2ModelTester: def __init__( self, parent, @@ -61,14 +54,9 @@ class ViTModelTester: num_attention_heads=4, intermediate_size=37, hidden_act="silu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, type_sequence_label_size=10, initializer_range=0.02, scope=None, - encoder_stride=2, - mask_ratio=0.5, - attn_implementation="eager", ): self.parent = parent self.batch_size = batch_size @@ -82,20 +70,12 @@ class ViTModelTester: self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.scope = scope - self.encoder_stride = encoder_stride - self.attn_implementation = attn_implementation - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - self.mask_ratio = mask_ratio - self.num_masks = int(mask_ratio * self.seq_length) - self.mask_length = num_patches + self.seq_length = num_patches def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -109,7 +89,7 @@ class ViTModelTester: return config, pixel_values, labels def get_config(self): - return ViTConfig( + return AIMv2Config( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, @@ -118,12 +98,8 @@ class ViTModelTester: num_attention_heads=self.num_attention_heads, intermediate_size=self.intermediate_size, hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, is_decoder=False, initializer_range=self.initializer_range, - encoder_stride=self.encoder_stride, - attn_implementation=self.attn_implementation, ) def create_and_check_model(self, config, pixel_values, labels): @@ -133,10 +109,9 @@ class ViTModelTester: result = model(pixel_values) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_image_classification(self, config, pixel_values, labels): config.num_labels = self.type_sequence_label_size - model = ViTForImageClassification(config) + model = AIMv2ForImageClassification(config) model.to(torch_device) model.eval() result = 
model(pixel_values, labels=labels) @@ -144,7 +119,7 @@ class ViTModelTester: # test greyscale images config.num_channels = 1 - model = ViTForImageClassification(config) + model = AIMv2ForImageClassification(config) model.to(torch_device) model.eval() @@ -164,22 +139,24 @@ class ViTModelTester: @require_torch -class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): +class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as Dinov2 does not use input_ids, inputs_embeds, attention_mask and seq_length. """ + test_torch_exportable = True + all_model_classes = ( ( AIMv2Model, - ViTForImageClassification, + AIMv2ForImageClassification, ) if is_torch_available() else () ) pipeline_model_mapping = ( - {"image-feature-extraction": AIMv2Model, "image-classification": ViTForImageClassification} + {"image-feature-extraction": AIMv2Model, "image-classification": AIMv2ForImageClassification} if is_torch_available() else {} ) @@ -188,26 +165,40 @@ class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_pruning = False test_resize_embeddings = False test_head_masking = False - test_torch_exportable = True def setUp(self): - self.model_tester = ViTModelTester(self) - self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37) + self.model_tester = AIMv2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=AIMv2Config, has_text_modality=False, hidden_size=37) - @unittest.skip( - "Since `torch==2.3+cu121`, although this test passes, many subsequent tests have `CUDA error: misaligned address`." - "If `nvidia-xxx-cu118` are also installed, no failure (even with `torch==2.3+cu121`)." 
- ) - def test_multi_gpu_data_parallel_forward(self): - super().test_multi_gpu_data_parallel_forward() + @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.") + def test_initialization(self): + super().test_initialization() def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="ViT does not use inputs_embeds") + @unittest.skip(reason="Dinov2 does not use inputs_embeds") def test_inputs_embeds(self): pass + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + def test_model_get_set_embeddings(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -221,93 +212,45 @@ class ViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - @slow - def test_model_from_pretrained(self): - model_name = "google/vit-base-patch16-224" - model = AIMv2Model.from_pretrained(model_name) - self.assertIsNotNone(model) + @unittest.skip(reason="Dinov2 does not support feedforward chunking yet") + def test_feed_forward_chunking(self): + pass # We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image +# def prepare_img(): +# image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") +# return image +# @require_torch +# @require_vision +# class Dinov2ModelIntegrationTest(unittest.TestCase): +# @cached_property +# def default_image_processor(self): +# return AutoImageProcessor.from_pretrained("facebook/dinov2-base") if is_vision_available() else None -@require_torch -@require_vision -class ViTModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") if is_vision_available() else None +# @slow +# def test_inference_no_head(self): +# model = Dinov2Model.from_pretrained("facebook/dinov2-base").to(torch_device) - @slow - def test_inference_image_classification_head(self): - model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224").to(torch_device) +# image_processor = self.default_image_processor +# image = prepare_img() +# inputs = image_processor(image, return_tensors="pt").to(torch_device) - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="pt").to(torch_device) +# # forward pass +# with torch.no_grad(): +# outputs = model(**inputs) - # forward pass - with torch.no_grad(): - outputs = model(**inputs) +# # verify the last hidden states +# 
expected_shape = torch.Size((1, 257, 768)) +# self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - # verify the logits - expected_shape = torch.Size((1, 1000)) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = torch.tensor([-0.2744, 0.8215, -0.0836]).to(torch_device) - - torch.testing.assert_close(outputs.logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4) - - @slow - def test_inference_interpolate_pos_encoding(self): - # ViT models have an `interpolate_pos_encoding` argument in their forward method, - # allowing to interpolate the pre-trained position embeddings in order to use - # the model on higher resolutions. The DINO model by Facebook AI leverages this - # to visualize self-attention on higher resolution images. - model = AIMv2Model.from_pretrained("facebook/dino-vits8").to(torch_device) - - image_processor = AutoImageProcessor.from_pretrained("facebook/dino-vits8", size=480) - image = prepare_img() - inputs = image_processor(images=image, return_tensors="pt") - pixel_values = inputs.pixel_values.to(torch_device) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values, interpolate_pos_encoding=True) - - # verify the logits - expected_shape = torch.Size((1, 3601, 384)) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = torch.tensor( - [[4.2325, 4.3882, -6.6678], [4.5372, 1.8933, -6.7355], [4.4454, 0.8514, -5.8747]] - ).to(torch_device) - - torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-3, atol=1e-3) - - @slow - @require_accelerate - @require_torch_accelerator - @require_torch_fp16 - def test_inference_fp16(self): - r""" - A small test to make sure that inference work in half precision without any problem. 
- """ - model = AIMv2Model.from_pretrained("facebook/dino-vits8", torch_dtype=torch.float16, device_map="auto") - image_processor = self.default_image_processor - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="pt") - pixel_values = inputs.pixel_values.to(torch_device) - - # forward pass to make sure inference works in fp16 - with torch.no_grad(): - _ = model(pixel_values) +# expected_slice = torch.tensor( +# [[-2.2005, -0.4495, 1.0964], [-3.3959, -0.8942, -1.0315], [-2.9355, 1.1564, -0.7656]], +# device=torch_device, +# ) +# torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], expected_slice, rtol=1e-3, atol=1e-3) From 11ed21d6b4978513a20d13e885355d7301736173 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 22 Mar 2025 23:09:13 +0530 Subject: [PATCH 08/62] Stupid mistake correction --- docs/source/en/_toctree.yml | 71 +------------------------------------ 1 file changed, 1 insertion(+), 70 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7dfa4d33be4..937fcd51249 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1,4 +1,3 @@ -- sections: - sections: - local: index title: Transformers @@ -7,11 +6,8 @@ - local: quicktour title: Quickstart title: Get started -- isExpanded: false - title: Get started - isExpanded: false sections: - - sections: - sections: - local: models title: Loading models @@ -34,8 +30,6 @@ - local: attention title: Attention mechanisms title: Models - - sections: - title: Models - sections: - local: fast_tokenizers title: Tokenizers @@ -53,12 +47,8 @@ title: Padding and truncation title: Preprocessors title: Base classes -- isExpanded: false - title: Preprocessors - title: Base classes - isExpanded: false sections: - - sections: - sections: - local: pipeline_tutorial title: Pipeline @@ -69,8 +59,6 @@ - local: add_new_pipeline title: Adding a new pipeline title: Pipeline API - - sections: - title: Pipeline API - sections: - local: llm_tutorial title: Text generation @@ -93,8 +81,6 @@ - local: perplexity title: Perplexity of fixed-length models title: LLMs - - sections: - title: LLMs - sections: - local: conversations title: Chat basics @@ -107,8 +93,6 @@ - local: chat_extras title: Tools and RAG title: Chat with models - - sections: - title: Chat with models - sections: - local: perf_torch_compile title: torch.compile @@ -121,17 +105,13 @@ - local: tf_xla title: XLA title: Optimization - title: Optimization - local: agents title: Agents - local: tools title: Tools title: Inference -- isExpanded: false - title: Inference - isExpanded: false sections: - - sections: - sections: - local: trainer title: Trainer @@ -142,8 +122,6 @@ - local: hpo_train title: Hyperparameter search title: Trainer API - - sections: - title: Trainer API - sections: - local: gpu_selection title: GPU selection @@ -160,8 +138,6 @@ - local: perf_train_gpu_many title: Parallelism methods title: Distributed training - - sections: - title: Distributed training - sections: - local: perf_train_gpu_one title: GPU @@ -174,14 +150,11 @@ - local: perf_hardware title: Build your own machine title: Hardware - title: Hardware - local: peft title: PEFT - local: model_memory_anatomy title: Model training anatomy title: Training -- isExpanded: false - title: Training - isExpanded: false sections: - local: quantization/overview @@ -225,8 +198,6 @@ - local: quantization/contribute title: Contribute title: Quantization -- isExpanded: false - title: Quantization - isExpanded: false sections: - local: 
serialization @@ -238,12 +209,8 @@ - local: torchscript title: TorchScript title: Export to production -- isExpanded: false - title: Export to production - isExpanded: false sections: - - sections: - - sections: - sections: - sections: - local: tasks/sequence_classification @@ -263,16 +230,12 @@ - local: tasks/multiple_choice title: Multiple choice title: Natural language processing - - sections: - title: Natural language processing - sections: - local: tasks/audio_classification title: Audio classification - local: tasks/asr title: Automatic speech recognition title: Audio - - sections: - title: Audio - sections: - local: tasks/image_classification title: Image classification @@ -299,8 +262,6 @@ - local: tasks/knowledge_distillation_for_image_classification title: Knowledge Distillation for Computer Vision title: Computer vision - - sections: - title: Computer vision - sections: - local: tasks/image_captioning title: Image captioning @@ -318,8 +279,6 @@ title: Video-text-to-text title: Multimodal title: Task recipes - title: Multimodal - title: Task recipes - local: run_scripts title: Training scripts - local: glossary @@ -333,8 +292,6 @@ - local: troubleshooting title: Troubleshoot title: Resources -- isExpanded: false - title: Resources - isExpanded: false sections: - local: contributing @@ -344,11 +301,8 @@ - local: pr_checks title: Pull request checks title: Contribute -- isExpanded: false - title: Contribute - isExpanded: false sections: - - sections: - sections: - local: main_classes/agent title: Agents and Tools @@ -397,9 +351,6 @@ - local: main_classes/image_processor title: Image Processor title: Main classes - - sections: - - sections: - title: Main classes - sections: - sections: - local: model_doc/albert @@ -713,8 +664,6 @@ - local: model_doc/zamba2 title: Zamba2 title: Text models - - sections: - title: Text models - sections: - local: model_doc/beit title: BEiT @@ -845,8 +794,6 @@ - local: model_doc/zoedepth title: ZoeDepth title: Vision models - - sections: - title: Vision models - sections: - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer @@ -917,8 +864,6 @@ - local: model_doc/xlsr_wav2vec2 title: XLSR-Wav2Vec2 title: Audio models - - sections: - title: Audio models - sections: - local: model_doc/timesformer title: TimeSformer @@ -928,8 +873,6 @@ title: ViViT title: Video models - sections: - - local: model_doc/aimv2 - title: AIMv2 - local: model_doc/align title: ALIGN - local: model_doc/altclip @@ -1077,16 +1020,12 @@ - local: model_doc/xclip title: X-CLIP title: Multimodal models - - sections: - title: Multimodal models - sections: - local: model_doc/decision_transformer title: Decision Transformer - local: model_doc/trajectory_transformer title: Trajectory Transformer title: Reinforcement learning models - - sections: - title: Reinforcement learning models - sections: - local: model_doc/autoformer title: Autoformer @@ -1099,16 +1038,11 @@ - local: model_doc/time_series_transformer title: Time Series Transformer title: Time series models - - sections: - title: Time series models - sections: - local: model_doc/graphormer title: Graphormer title: Graph models title: Models - - sections: - title: Graph models - title: Models - sections: - local: internal/modeling_utils title: Custom Layers and Utilities @@ -1131,7 +1065,4 @@ - local: internal/time_series_utils title: Utilities for Time Series title: Internal helpers - title: API - - title: Internal helpers - title: API + title: API \ No newline at end of file From 
999be2a19885d7bc4949fefc0e63ef923e0b0c7b Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sun, 23 Mar 2025 00:25:33 +0530 Subject: [PATCH 09/62] Added config and refactor --- .../models/aimv2/configuration_aimv2.py | 238 ++++++-- .../models/aimv2/modeling_aimv2.py | 539 ++++++++++++++---- .../models/aimv2/modular_aimv2.py | 340 +++++++++-- 3 files changed, 929 insertions(+), 188 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index cd1c7fe30f5..17eb9129665 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -21,66 +21,62 @@ from ...configuration_utils import PretrainedConfig +from ...utils import logging -class AIMv2Config(PretrainedConfig): +logger = logging.get_logger(__name__) + + +class AIMv2VisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to instantiate an AIMv2 - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the AIMv2 + This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a + AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. num_hidden_layers (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` are supported. - hidden_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 16): The size (resolution) of each patch. 
- num_channels (`int`, *optional*, defaults to 3): - The number of input channels. - qkv_bias (`bool`, *optional*, defaults to `True`): - Whether to add a bias to the queries, keys and values. - encoder_stride (`int`, *optional*, defaults to 16): - Factor to increase the spatial resolution by in the decoder head for masked image modeling. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. Example: ```python - >>> from transformers import AIMv2Config, AIMv2Model + >>> from transformers import AIMv2VisionConfig, AIMv2VisionModel - >>> # Initializing a AIMv2 aimv2-base-patch16-224 style configuration - >>> configuration = AIMv2Config() + >>> # Initializing a AIMv2VisionConfig with google/aimv2-base-patch16-224 style configuration + >>> configuration = AIMv2VisionConfig() - >>> # Initializing a model (with random weights) from the aimv2-base-patch16-224 style configuration - >>> model = AIMv2Model(configuration) + >>> # Initializing a AIMv2VisionModel (with random weights) from the google/aimv2-base-patch16-224 style configuration + >>> model = AIMv2VisionModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "aimv2" + model_type = "aimv2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -97,21 +93,111 @@ class AIMv2Config(PretrainedConfig): qkv_bias: bool = False, use_bias: bool = False, hidden_act="silu", - initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) self.hidden_size = hidden_size + self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.image_size = image_size - self.patch_size = patch_size self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.use_bias = use_bias self.qkv_bias = qkv_bias + self.rms_norm_eps = rms_norm_eps + self.projection_dropout = projection_dropout + + +class AIMv2TextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a + AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 + [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`AIMv2Model`]. 
+ hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 64): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + pad_token_id (`int`, *optional*, defaults to 1): + The id of the padding token in the vocabulary. + bos_token_id (`int`, *optional*, defaults to 49406): + The id of the beginning-of-sequence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 49407): + The id of the end-of-sequence token in the vocabulary. + projection_size (`int`, *optional*, defaults to `hidden_size`): + The size of the projection head. + + Example: + + ```python + >>> from transformers import AIMv2TextConfig, AIMv2TextModel + + >>> # Initializing a AIMv2TextConfig with google/aimv2-base-patch16-224 style configuration + >>> configuration = AIMv2TextConfig() + + >>> # Initializing a AIMv2TextModel (with random weights) from the google/aimv2-base-patch16-224 style configuration + >>> model = AIMv2TextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "aimv2_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size: int = 49408, + hidden_size: int = 768, + intermediate_size: int = 2048, + num_hidden_layers: int = 12, + num_attention_heads: int = 6, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + pad_token_id=None, + bos_token_id=None, + eos_token_id: int = 49407, + max_position_embeddings: int = 77, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings self.attention_dropout = attention_dropout self.rms_norm_eps = rms_norm_eps @@ -119,4 +205,82 @@ class AIMv2Config(PretrainedConfig): self.use_bias = use_bias -__all__ = ["AIMv2Config"] +class AIMv2Config(PretrainedConfig): + r""" + [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to + instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2 + [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AIMv2TextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AIMv2VisionConfig`]. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import AIMv2Config, AIMv2Model + + >>> # Initializing a AIMv2Config with google/aimv2-base-patch16-224 style configuration + >>> configuration = AIMv2Config() + + >>> # Initializing a AIMv2Model (with random weights) from the google/aimv2-base-patch16-224 style configuration + >>> model = AIMv2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a AIMv2Config from a AIMv2TextConfig and a AIMv2VisionConfig + >>> from transformers import AIMv2TextConfig, AIMv2VisionConfig + + >>> # Initializing a AIMv2Text and AIMv2Vision configuration + >>> config_text = AIMv2TextConfig() + >>> config_vision = AIMv2VisionConfig() + + >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "aimv2" + sub_configs = {"text_config": AIMv2TextConfig, "vision_config": AIMv2VisionConfig} + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `AIMv2TextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `AIMv2VisionConfig` with default values.") + + self.text_config = AIMv2TextConfig(**text_config) + self.vision_config = AIMv2VisionConfig(**vision_config) + + self.initializer_factor = 1.0 + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + + @classmethod + def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs): + r""" + Instantiate a [`AIMv2Config`] (or a derived class) from aimv2 text model configuration and aimv2 vision + model configuration. + + Returns: + [`AIMv2Config`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +__all__ = ["AIMv2Config", "AIMv2VisionConfig", "AIMv2TextConfig"] diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 44bb7b7200d..6d0c23937e0 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -20,30 +20,66 @@ # limitations under the License. 
-from typing import Callable, Optional, Tuple, Union +import math +from dataclasses import dataclass +from typing import Any, Callable, Optional, Tuple, Union import torch from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN -from ...modeling_outputs import ImageClassifierOutput +from ...modeling_outputs import BaseModelOutputWithPooling from ...utils import ( - add_code_sample_docstrings, + ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging, + replace_return_docstrings, ) -from .configuration_aimv2 import AIMv2Config +from .configuration_aimv2 import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig logger = logging.get_logger(__name__) -# General docstring -_CONFIG_FOR_DOC = "AIMv2Config" + +@dataclass +class AIMv2Output(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`AIMv2TextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`AIMv2VisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`AIMv2TextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`AIMv2VisionModel`]. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) class AIMv2RMSNorm(nn.Module): @@ -67,7 +103,7 @@ class AIMv2RMSNorm(nn.Module): class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: AIMv2Config): + def __init__(self, config: AIMv2VisionConfig): super().__init__() in_features = config.hidden_size out_features = config.intermediate_size @@ -85,8 +121,8 @@ class AIMv2SwiGLUFFN(nn.Module): return hidden_states -class AIMv2Embeddings(nn.Module): - def __init__(self, config: AIMv2Config): +class AIMv2VisionEmbeddings(nn.Module): + def __init__(self, config: AIMv2VisionConfig): super().__init__() self.config = config self.patch_size = config.patch_size @@ -132,6 +168,46 @@ class AIMv2Embeddings(nn.Module): return hidden_states +class AIMv2TextEmbeddings(nn.Module): + def __init__(self, config: AIMv2TextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + max_position_embedding = self.position_embedding.weight.shape[0] + + if seq_length > max_position_embedding: + raise ValueError( + f"Sequence length must be less than max_position_embeddings (got `sequence length`: " + f"{seq_length} and max_position_embeddings: {max_position_embedding}" + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + def eager_attention_forward( module: nn.Module, query_states: torch.Tensor, @@ -160,7 +236,7 @@ def eager_attention_forward( class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AIMv2Config): + def __init__(self, config: AIMv2VisionConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -232,7 +308,7 @@ class AIMv2Attention(nn.Module): class AIMv2EncoderLayer(nn.Module): - def __init__(self, config: AIMv2Config): + def __init__(self, config: AIMv2VisionConfig): super().__init__() self.attention = AIMv2Attention(config) self.ffn = AIMv2SwiGLUFFN(config) @@ -371,11 +447,11 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.weight.data[module.padding_idx].zero_() -class AIMv2Model(AIMv2PreTrainedModel): - def __init__(self, config: AIMv2Config): +class AIMv2VisionModel(AIMv2PreTrainedModel): + def __init__(self, config: 
AIMv2VisionConfig): super().__init__(config) self.config = config - self.embeddings = AIMv2Embeddings(config) + self.embeddings = AIMv2VisionEmbeddings(config) self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) @@ -416,15 +492,71 @@ class AIMv2Model(AIMv2PreTrainedModel): ) -# Image classification docstring -_IMAGE_CLASS_CHECKPOINT = "facebook/aimv2-small-imagenet1k-1-layer" -_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" +class AIMv2TextModel(AIMv2PreTrainedModel): + def __init__(self, config: AIMv2VisionConfig): + super().__init__(config) + self.config = config + self.embeddings = AIMv2TextEmbeddings(config) + self.encoder = AIMv2Encoder(config) + # Here comes the eos extract class + self.head = nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + pixel_values, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.rms_norm(last_hidden_state) + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: + """ + This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make + model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566 + """ + square_tensor = torch.pow(tensor, 2) + sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True) + normed_tensor = torch.pow(sum_tensor, 0.5) + return normed_tensor AIMV2_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. Parameters: config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. @@ -432,18 +564,28 @@ AIMV2_START_DOCSTRING = r""" configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ -AIMV2_INPUTS_DOCSTRING = r""" +AIMV2_TEXT_INPUTS_DOCSTRING = r""" Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. 
Pixel values can be obtained using [`AutoImageProcessor`]. See - [`BitImageProcessor.preprocess`] for details. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -454,105 +596,304 @@ AIMV2_INPUTS_DOCSTRING = r""" Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ +AIMV2_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" -@add_start_docstrings( - """ - AIMv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state - of the [CLS] token) e.g. for ImageNet. - """, - AIMV2_START_DOCSTRING, -) -class AIMv2ForImageClassification(AIMv2PreTrainedModel): - def __init__(self, config: AIMv2Config) -> None: - super().__init__(config) +AIMV2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. - self.num_labels = config.num_labels - self.aimv2 = AIMv2Model(config) + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
- # Classifier head - self.classifier = ( - nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity() - ) + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings(AIMV2_START_DOCSTRING) +class AIMv2Model(AIMv2PreTrainedModel): + config_class = AIMv2Config + _no_split_modules = ["AIMv2TextEmbeddings", "AIMv2EncoderLayer", "AIMv2VisionEmbeddings"] + + def __init__(self, config: AIMv2Config): + super().__init__() + + if not isinstance(config.text_config, AIMv2VisionConfig): + raise TypeError( + "config.text_config is expected to be of type CLIPTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, AIMv2VisionConfig): + raise TypeError( + "config.vision_config is expected to be of type CLIPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = AIMv2TextModel._from_config(text_config) + + self.vision_model = AIMv2VisionModel._from_config(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + + # Verify whether it's working right or not. 
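+        # Note: the temperature for the contrastive logits is parameterized in log space below. `forward` recovers
+        # it with `self.log_logit_scale.clamp(0.0, self.max_log_logit_scale).exp()`, which keeps the effective
+        # scale inside `[1, config.max_logit_scale]`.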
+ logit_scale_tensor = torch.tensor(self.config.logit_scale_init_value) + self.log_logit_scale = nn.Parameter(torch.log(logit_scale_tensor)) + + self.max_log_logit_scale = math.log(config.max_logit_scale) # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(AIMV2_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_IMAGE_CLASS_CHECKPOINT, - output_type=ImageClassifierOutput, - config_class=_CONFIG_FOR_DOC, - expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, - ) - def forward( + @add_start_docstrings_to_model_forward(AIMV2_TEXT_INPUTS_DOCSTRING) + def get_text_features( self, - pixel_values: Optional[torch.Tensor] = None, - head_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[tuple, ImageClassifierOutput]: + ) -> torch.FloatTensor: r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the image classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`AIMv2TextModel`]. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, AIMv2Model + + >>> model = AIMv2Model.from_pretrained("openai/aimv2-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + # Use AIMV2 model's config for some fields (if specified) instead of those of vision & text components. 
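+        # The returned embeddings are the raw output of `text_projection`; unlike `forward`, no L2 normalization is
+        # applied here, so divide by the vector norm (see `_get_vector_norm`) if cosine similarities are needed.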
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.aimv2( - pixel_values, - head_mask=head_mask, + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - sequence_output = outputs[0] # batch_size, sequence_length, hidden_size + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) - cls_token = sequence_output[:, 0] - patch_tokens = sequence_output[:, 1:] + return text_features - linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`AIMv2VisionModel`]. - logits = self.classifier(linear_input) + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AIMv2Model + + >>> model = AIMv2Model.from_pretrained("openai/aimv2-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use AIMV2 model's config for some fields (if specified) instead of those of vision & text components. 
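+        # As with `get_text_features`, the projected image embeddings are returned un-normalized; `forward`
+        # normalizes both modalities before computing the similarity logits.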
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @add_start_docstrings_to_model_forward(AIMV2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AIMv2Output, config_class=AIMv2Config) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, AIMv2Output]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AIMv2Model + + >>> model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / _get_vector_norm(image_embeds) + text_embeds = text_embeds / _get_vector_norm(text_embeds) + + logit_scale = self.log_logit_scale.clamp(0.0, self.max_log_logit_scale).exp() + logits_per_text = text_embeds * logit_scale.exp().to(text_embeds.device) + logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) + + logits_per_image = logits_per_text.t() loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) + # if return_loss: + # Use the loss used in aimv2 + # loss = clip_loss(logits_per_text) if not return_dict: - output = (logits,) + outputs[2:] + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) return ((loss,) + output) if loss is not None else output - return ImageClassifierOutput( + return AIMv2Output( loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, ) -__all__ = ["AIMv2Model", "AIMv2ForImageClassification"] +__all__ = ["AIMv2VisionModel", "AIMv2Model"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 9be829feeb6..6727d19abda 100644 --- 
a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -15,7 +15,8 @@ """Pytorch implementation of AIMv2 Model""" -from typing import Callable, Optional, Tuple +import math +from typing import Callable, Optional, Tuple, Union import torch from torch import nn @@ -27,22 +28,120 @@ from ...activations import ACT2FN from ...utils import ( logging, ) -from ..dinov2.modeling_dinov2 import Dinov2ForImageClassification +from ..clip.modeling_clip import CLIPModel, CLIPOutput, CLIPTextEmbeddings, _get_vector_norm from ..llama.modeling_llama import LlamaRMSNorm +from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig from ..siglip.modeling_siglip import SiglipEncoder -from ..vit.configuration_vit import ViTConfig -from .configuration_aimv2 import AIMv2Config logger = logging.get_logger(__name__) +class AIMv2VisionConfig(SiglipVisionConfig): + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + hidden_act="silu", + **kwargs, + ): + super().__init__( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_act=hidden_act, + num_channels=num_channels, + image_size=image_size, + patch_size=patch_size, + qkv_bias=qkv_bias, + **kwargs, + ) + + self.attention_dropout = attention_dropout + self.use_bias = use_bias + self.qkv_bias = qkv_bias + self.rms_norm_eps = rms_norm_eps + self.projection_dropout = projection_dropout + + del self.layer_norm_eps + + +class AIMv2TextConfig(SiglipTextConfig): + def __init__( + self, + vocab_size: int = 49408, + hidden_size: int = 768, + intermediate_size: int = 2048, + num_hidden_layers: int = 12, + num_attention_heads: int = 6, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + projection_dropout: float = 0.0, + qkv_bias: bool = False, + use_bias: bool = False, + pad_token_id=None, + bos_token_id=None, + eos_token_id: int = 49407, + max_position_embeddings: int = 77, + **kwargs, + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + max_position_embeddings=max_position_embeddings, + qkv_bias=qkv_bias, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + self.projection_dropout = projection_dropout + self.use_bias = use_bias + + del self.bos_token_id + del self.pad_token_id + del self.projection_size + del self.hidden_act + del self.layer_norm_eps + + +class AIMv2Config(SiglipConfig): + # Modify default logit scale value accordingly with aimv2 configs + def __init__(self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): + super().__init__(text_config, vision_config, **kwargs) + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + + pass + + +class AIMv2Output(CLIPOutput): + pass + + class AIMv2RMSNorm(LlamaRMSNorm): pass class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: AIMv2Config): + 
def __init__(self, config: AIMv2VisionConfig): super().__init__() in_features = config.hidden_size out_features = config.intermediate_size @@ -60,8 +159,8 @@ class AIMv2SwiGLUFFN(nn.Module): return hidden_states -class AIMv2Embeddings(nn.Module): - def __init__(self, config: AIMv2Config): +class AIMv2VisionEmbeddings(nn.Module): + def __init__(self, config: AIMv2VisionConfig): super().__init__() self.config = config self.patch_size = config.patch_size @@ -107,6 +206,10 @@ class AIMv2Embeddings(nn.Module): return hidden_states +class AIMv2TextEmbeddings(CLIPTextEmbeddings): + pass + + def eager_attention_forward( module: nn.Module, query_states: torch.Tensor, @@ -135,7 +238,7 @@ def eager_attention_forward( class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AIMv2Config): + def __init__(self, config: AIMv2VisionConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -207,7 +310,7 @@ class AIMv2Attention(nn.Module): class AIMv2EncoderLayer(nn.Module): - def __init__(self, config: AIMv2Config): + def __init__(self, config: AIMv2VisionConfig): super().__init__() self.attention = AIMv2Attention(config) self.ffn = AIMv2SwiGLUFFN(config) @@ -262,11 +365,11 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.weight.data[module.padding_idx].zero_() -class AIMv2Model(AIMv2PreTrainedModel): - def __init__(self, config: AIMv2Config): +class AIMv2VisionModel(AIMv2PreTrainedModel): + def __init__(self, config: AIMv2VisionConfig): super().__init__(config) self.config = config - self.embeddings = AIMv2Embeddings(config) + self.embeddings = AIMv2VisionEmbeddings(config) self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) @@ -307,52 +410,185 @@ class AIMv2Model(AIMv2PreTrainedModel): ) -class AIMv2ForImageClassification(Dinov2ForImageClassification): - pass +class AIMv2TextModel(AIMv2PreTrainedModel): + def __init__(self, config: AIMv2VisionConfig): + super().__init__(config) + self.config = config + self.embeddings = AIMv2TextEmbeddings(config) + self.encoder = AIMv2Encoder(config) + # Here comes the eos extract class + self.head = nn.Identity() + # Initialize weights and apply final processing + self.post_init() -class AIMv2Config(ViTConfig): - def __init__( + def forward( self, - hidden_size: int = 1024, - intermediate_size: int = 2816, - num_hidden_layers: int = 24, - num_attention_heads: int = 8, - num_channels: int = 3, - image_size: int = 224, - patch_size: int = 14, - rms_norm_eps: float = 1e-5, - attention_dropout: float = 0.0, - projection_dropout: float = 0.0, - qkv_bias: bool = False, - use_bias: bool = False, - hidden_act="silu", - initializer_range=0.02, - **kwargs, + pixel_values, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, ): - super().__init__( - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - hidden_act=hidden_act, - num_channels=num_channels, - image_size=image_size, - patch_size=patch_size, - qkv_bias=qkv_bias, - initializer_range=initializer_range, - **kwargs, + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + 
return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - self.attention_dropout = attention_dropout - self.rms_norm_eps = rms_norm_eps - self.projection_dropout = projection_dropout - self.use_bias = use_bias + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.rms_norm(last_hidden_state) - del self.attention_probs_dropout_prob - del self.layer_norm_eps - del self.encoder_stride - del self.hidden_dropout_prob + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) -__all__ = ["AIMv2Config", "AIMv2Model", "AIMv2ForImageClassification"] +class AIMv2Model(CLIPModel, nn.Module): + def __init__(self, config: AIMv2Config): + nn.Module().__init__() + + if not isinstance(config.text_config, AIMv2VisionConfig): + raise TypeError( + "config.text_config is expected to be of type CLIPTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, AIMv2VisionConfig): + raise TypeError( + "config.vision_config is expected to be of type CLIPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = AIMv2TextModel._from_config(text_config) + + self.vision_model = AIMv2VisionModel._from_config(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + + # Verify whether it's working right or not. + logit_scale_tensor = torch.tensor(self.config.logit_scale_init_value) + self.log_logit_scale = nn.Parameter(torch.log(logit_scale_tensor)) + + self.max_log_logit_scale = math.log(config.max_logit_scale) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, AIMv2Output]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / _get_vector_norm(image_embeds) + text_embeds = text_embeds / _get_vector_norm(text_embeds) + + logit_scale = self.log_logit_scale.clamp(0.0, self.max_log_logit_scale).exp() + logits_per_text = text_embeds * logit_scale.exp().to(text_embeds.device) + logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) + + logits_per_image = logits_per_text.t() + + loss = None + # if return_loss: + # Use the loss used in aimv2 + # loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return AIMv2Output( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +__all__ = ["AIMv2Config", "AIMv2VisionConfig", "AIMv2TextConfig", "AIMv2VisionModel", "AIMv2Model"] From 21b9231ba8a86155a3c136bc38fbf119458e589c Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 16:19:56 +0530 Subject: [PATCH 10/62] Added vison model --- src/transformers/models/auto/configuration_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index f7f648cf97e..8f5d489a70f 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -360,6 +360,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here ("aimv2", "AIMv2"), + ("aimv2_vision_model", "AIMv2VisionModel"), ("albert", "ALBERT"), ("align", "ALIGN"), ("altclip", "AltCLIP"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 24fd4078846..1d9e0059bf4 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -34,6 +34,7 @@ MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping ("aimv2", "AIMv2Model"), + ("aimv2_vision_model", "AIMv2VisionModel"), ("albert", "AlbertModel"), ("align", "AlignModel"), 
("altclip", "AltCLIPModel"), From f59ac3b818dcfda63f435ca09c110b16fc20b871 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 16:20:13 +0530 Subject: [PATCH 11/62] update --- src/transformers/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 18b89cd8f66..712eb3684ac 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1859,9 +1859,9 @@ else: ) _import_structure["models.aimv2"].extend( [ - "AIMv2ForImageClassification", "AIMv2Model", "AIMv2PreTrainedModel", + "AIMv2VisionModel", ] ) _import_structure["models.clipseg"].extend( @@ -6719,13 +6719,8 @@ if TYPE_CHECKING: from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from .modeling_utils import PreTrainedModel from .models.aimv2 import ( - AIMv2ForImageClassification, AIMv2Model, - AIMv2PreTrainedModel, - AIMv2TextModel, - AIMv2TextModelWithProjection, AIMv2VisionModel, - AIMv2VisionModelWithProjection, ) from .models.albert import ( AlbertForMaskedLM, From c22841e91d562f6aa1d00edec9a79c3281ab8f95 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 16:20:33 +0530 Subject: [PATCH 12/62] Refactor for lit variant --- .../convert_aimv2_original_pytorch_to_hf.py | 101 ++++++++++++++---- 1 file changed, 79 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 58f96fb906e..6b6a585997e 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -23,13 +23,13 @@ import torch from huggingface_hub import snapshot_download from safetensors import safe_open -from transformers import AIMv2Config, AIMv2Model, AutoProcessor +from transformers import AIMv2Config, AIMv2Model, AIMv2VisionConfig, AIMv2VisionModel, AutoProcessor -NEW_MODEL_KEY_MAPPING = { +ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL = { # Embeddings r"preprocessor.patchifier.proj": r"embeddings.patch_embed", - r"preprocessor.pos_embed": r"embeddings.position_embeddings.weight", + r"preprocessor.pos_embed": r"embeddings.position_embedding.weight", r"preprocessor.patchifier.norm.weight": r"embeddings.rms_norm.weight", # Encoder Layers r"trunk.blocks.(\d+).attn.qkv": r"encoder.layers.\1.attention.qkv", @@ -44,6 +44,40 @@ NEW_MODEL_KEY_MAPPING = { r"trunk.post_trunk_norm": r"rms_norm", } +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + # Vision Embeddings + r"image_encoder.preprocessor.patchifier.proj": r"vision_model.embeddings.patch_embed", + r"image_encoder.preprocessor.pos_embed": r"vision_model.embeddings.position_embedding.weight", + r"image_encoder.preprocessor.patchifier.norm.weight": r"vision_model.embeddings.rms_norm.weight", + # Vision Encoder Layers + r"image_encoder.trunk.blocks.(\d+).attn.qkv": r"vision_model.encoder.layers.\1.attention.qkv", + r"image_encoder.trunk.blocks.(\d+).attn.proj": r"vision_model.encoder.layers.\1.attention.proj_out", + r"image_encoder.trunk.blocks.(\d+).mlp.fc1": r"vision_model.encoder.layers.\1.ffn.fc1", + r"image_encoder.trunk.blocks.(\d+).mlp.fc2": r"vision_model.encoder.layers.\1.ffn.fc2", + r"image_encoder.trunk.blocks.(\d+).mlp.fc3": r"vision_model.encoder.layers.\1.ffn.fc3", + # Normalization Layers + r"image_encoder.trunk.blocks.(\d+).norm_1": r"vision_model.encoder.layers.\1.rms_norm1", + r"image_encoder.trunk.blocks.(\d+).norm_2": 
r"vision_model.encoder.layers.\1.rms_norm2", + r"image_encoder.trunk.post_trunk_norm": r"vision_model.rms_norm", + r"image_projector": r"visual_projection", + # Vision Head + r"image_encoder.head": r"vision_model.head", + # Text Embeddings + r"text_encoder.preprocessor.text_embedding.weight": r"text_model.embeddings.token_embedding.weight", + r"text_encoder.preprocessor.positional_embedding": r"text_model.embeddings.position_embedding.weight", + # Text Encoder Layers + r"text_encoder.trunk.blocks.(\d+).attn.qkv": r"text_model.encoder.layers.\1.attention.qkv", + r"text_encoder.trunk.blocks.(\d+).attn.proj": r"text_model.encoder.layers.\1.attention.proj_out", + r"text_encoder.trunk.blocks.(\d+).mlp.fc1": r"text_model.encoder.layers.\1.ffn.fc1", + r"text_encoder.trunk.blocks.(\d+).mlp.fc2": r"text_model.encoder.layers.\1.ffn.fc2", + r"text_encoder.trunk.blocks.(\d+).mlp.fc3": r"text_model.encoder.layers.\1.ffn.fc3", + # Text Normalization Layers + r"text_encoder.trunk.blocks.(\d+).norm_1": r"text_model.encoder.layers.\1.rms_norm1", + r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2", + r"text_encoder.trunk.post_trunk_norm": r"text_model.rms_norm", + r"text_projector": r"text_projection", +} + def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> Dict[str, torch.Tensor]: # Download only the model.safetensors file @@ -63,14 +97,14 @@ def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> D return original_state_dict -def convert_old_keys_to_new_keys(state_dict_keys: dict = None): +def convert_old_keys_to_new_keys(state_dict_keys: dict, ORIGINAL_TO_CONVERTED_KEY_MAPPING: dict): """Converts state dict keys from the old format to the new format.""" output_dict = {} if state_dict_keys is not None: old_text = "\n".join(state_dict_keys) new_text = old_text - for pattern, replacement in NEW_MODEL_KEY_MAPPING.items(): + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): if replacement is None: new_text = re.sub(pattern, "", new_text) # an empty line continue @@ -89,22 +123,51 @@ def split_qkv_tensor(key, tensor): return {key.replace("qkv", new_key): split_tensors[i] for i, new_key in enumerate(new_keys)} +def get_model_config_mapping(model_id: str): + """Determines the correct model, config, and key mappings based on the checkpoint name.""" + + if model_id == "apple/aimv2-large-patch14-224-lit": + return AIMv2Model, AIMv2Config, ORIGINAL_TO_CONVERTED_KEY_MAPPING + else: + return AIMv2VisionModel, AIMv2VisionConfig, ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL + + def write_model( hf_repo_id: str, output_dir: str, safe_serialization: bool = True, ): + """ + Converts a model checkpoint to Hugging Face format and saves it. + + Args: + hf_repo_id (str): The Hugging Face repo ID to load from. + output_dir (str): The directory to save the converted model. + safe_serialization (bool): Whether to use safe serialization. + + Returns: + model: The reloaded Hugging Face model. + """ os.makedirs(output_dir, exist_ok=True) - # create config - config = AIMv2Config.from_pretrained(hf_repo_id) + # Get the appropriate model, config, and key mapping + model_class, config_class, key_mapping = get_model_config_mapping(hf_repo_id) + + # Load config and original state dict + config = config_class.from_pretrained(hf_repo_id) + + # Checkpoint `apple/aimv2-large-patch14-224-lit` uses AttentionPoolingHead hence set the required attr in config. 
+ if hf_repo_id == "apple/aimv2-large-patch14-224-lit": + config.vision_config.use_head = True - # Load original model state dict original_state_dict = load_original_state_dict(hf_repo_id) print("Converting model...") + state_dict = {} - result = convert_old_keys_to_new_keys(original_state_dict) + # For `apple/aimv2-large-patch14-native` we don't have position_embedding in state_dict + strict_loading = False + result = convert_old_keys_to_new_keys(original_state_dict, key_mapping) all_keys = list(original_state_dict.keys()) for key in all_keys: @@ -117,18 +180,13 @@ def write_model( else: state_dict[new_key] = value - # Check if position embeddings exist before squeezing - if "embeddings.position_embeddings.weight" in state_dict: - state_dict["embeddings.position_embeddings.weight"] = state_dict[ - "embeddings.position_embeddings.weight" - ].squeeze(0) - strict_loading = True - else: - # For `apple/aimv2-large-patch14-native` we don't have position_embeddings in state_dict - strict_loading = False + # Check if position embeddings exist before squeezing + if new_key.endswith("position_embedding.weight"): + state_dict[new_key] = value.squeeze(0) + strict_loading = True - print("Loading the checkpoint in a DepthPro model.") - model = AIMv2Model(config) + print(f"Loading the checkpoint in a {model_class.__name__}.") + model = model_class(config) model.load_state_dict(state_dict, strict=strict_loading, assign=True) print("Checkpoint loaded successfully.") @@ -139,7 +197,7 @@ def write_model( # Safety check: reload the converted model gc.collect() print("Reloading the model to check if it's saved correctly.") - model = AIMv2Model.from_pretrained(output_dir, device_map="auto") + model = model_class.from_pretrained(output_dir, device_map="auto") print("Model reloaded successfully.") return model @@ -197,4 +255,3 @@ def main(): if __name__ == "__main__": main() -# python src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py.py --hf_repo_id apple/aimv2-large-patch14-224 --output_dir tmp/aimv2 --safe_serialization From ce81e5eb2520180212b702defc63aaeeabbdeed3 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 16:23:24 +0530 Subject: [PATCH 13/62] Added Text Model --- .../models/aimv2/modeling_aimv2.py | 169 ++++++++++------ .../models/aimv2/modular_aimv2.py | 190 ++++++++++++------ 2 files changed, 231 insertions(+), 128 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 6d0c23937e0..acf84772b1c 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -25,13 +25,14 @@ from dataclasses import dataclass from typing import Any, Callable, Optional, Tuple, Union import torch +import torch.nn.functional as F from torch import nn -from transformers.modeling_outputs import BaseModelOutput +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutputWithPooling from ...utils import ( ModelOutput, add_start_docstrings, @@ -132,7 +133,7 @@ class AIMv2VisionEmbeddings(nn.Module): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) num_patches = (config.image_size // config.patch_size) ** 2 - self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + 
self.position_embedding = nn.Embedding(num_patches, config.hidden_size) self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod @@ -162,7 +163,7 @@ class AIMv2VisionEmbeddings(nn.Module): height // self.patch_size, width // self.patch_size, embed_dim=self.config.hidden_size ) else: - pos_embed = self.position_embeddings(self.position_ids) + pos_embed = self.position_embedding(self.position_ids) hidden_states = hidden_states + pos_embed return hidden_states @@ -213,15 +214,14 @@ def eager_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, value_states: torch.Tensor, - head_mask: Optional[torch.Tensor], - scaling: float, + attention_mask: Optional[torch.Tensor], dropout: float = 0.0, **kwargs, ): - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - if head_mask is not None: - attn_weights = attn_weights + head_mask + if attention_mask is not None: + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -248,7 +248,6 @@ class AIMv2Attention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads})." ) - self.scale = self.head_dim**-0.5 self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) @@ -259,7 +258,7 @@ class AIMv2Attention(nn.Module): def forward( self, hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -290,9 +289,8 @@ class AIMv2Attention(nn.Module): query_states, key_states, value_states, - head_mask, + attention_mask, dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scale, is_causal=False, **kwargs, ) @@ -318,12 +316,12 @@ class AIMv2EncoderLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) - attn_output, attn_wights = self.attention( - hidden_states=norm_hidden_states, head_mask=head_mask, output_attentions=output_attentions + attn_output, attn_weights = self.attention( + hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions ) hidden_states = hidden_states + attn_output @@ -331,7 +329,7 @@ class AIMv2EncoderLayer(nn.Module): mlp_output = self.ffn(norm_hidden_states) hidden_states = hidden_states + mlp_output - return (hidden_states, attn_wights) if output_attentions else (hidden_states, None) + return (hidden_states, attn_weights) if output_attentions else (hidden_states, None) class AIMv2Encoder(nn.Module): @@ -422,6 +420,35 @@ class AIMv2Encoder(nn.Module): ) +class AIMv2AttentionPoolingHead(nn.Module): + def __init__(self, config: AIMv2VisionConfig): + super().__init__() + dim = config.hidden_size + qkv_bias = config.qkv_bias + + self.num_heads = config.num_attention_heads + + self.k = nn.Linear(dim, dim, bias=qkv_bias) + self.v = nn.Linear(dim, dim, bias=qkv_bias) + self.cls_token = nn.Parameter(torch.randn(1, 1, dim) * 0.02) + self.linear = 
nn.Linear(dim, dim, bias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + cls_token = self.cls_token.expand(B, -1, -1) + + q = cls_token.reshape(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = self.k(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + v = self.v(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + x_cls = F.scaled_dot_product_attention(q, k, v) + x_cls = x_cls.transpose(1, 2).reshape(B, 1, C) + x_cls = x_cls.mean(dim=1) + + out = self.linear(x_cls) + return out + + class AIMv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -436,7 +463,11 @@ class AIMv2PreTrainedModel(PreTrainedModel): _supports_sdpa = True def _init_weights(self, module): - std = self.config.initializer_range + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) if isinstance(module, (nn.Linear, nn.Conv2d)): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: @@ -455,13 +486,18 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + # Use attention pooling head only for lit vairant + self.use_head = config.use_head + if self.use_head: + self.head = AIMv2AttentionPoolingHead(config) + # Initialize weights and apply final processing self.post_init() def forward( self, pixel_values, - head_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -476,38 +512,44 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): encoder_outputs = self.encoder( inputs_embeds=hidden_states, - head_mask=head_mask, + attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) - return BaseModelOutput( + if self.use_head: + last_hidden_state = self.head(last_hidden_state) + + output = BaseModelOutput( last_hidden_state=last_hidden_state, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + return output if return_dict else output.to_tuple() + class AIMv2TextModel(AIMv2PreTrainedModel): - def __init__(self, config: AIMv2VisionConfig): + def __init__(self, config: AIMv2TextConfig): super().__init__(config) self.config = config self.embeddings = AIMv2TextEmbeddings(config) self.encoder = AIMv2Encoder(config) - # Here comes the eos extract class - self.head = nn.Identity() + self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + self.eos_token_id = config.eos_token_id # Initialize weights and apply final processing self.post_init() def forward( self, - pixel_values, - head_mask: Optional[torch.Tensor] = None, + input_ids, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -518,25 +560,40 @@ class AIMv2TextModel(AIMv2PreTrainedModel): ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = self.embeddings(pixel_values) + hidden_states = 
self.embeddings(input_ids) + _, seq_len, _ = hidden_states.shape + + mask_converter = AttentionMaskConverter(True) + attention_mask = mask_converter.to_4d( + attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype + ) encoder_outputs = self.encoder( inputs_embeds=hidden_states, - head_mask=head_mask, + attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) - return BaseModelOutput( + # Get pooled output + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1), + ] + + output = BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, + pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + return output if return_dict else output.to_tuple() + def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: """ @@ -659,30 +716,29 @@ class AIMv2Model(AIMv2PreTrainedModel): _no_split_modules = ["AIMv2TextEmbeddings", "AIMv2EncoderLayer", "AIMv2VisionEmbeddings"] def __init__(self, config: AIMv2Config): - super().__init__() - - if not isinstance(config.text_config, AIMv2VisionConfig): - raise TypeError( - "config.text_config is expected to be of type CLIPTextConfig but is of type" - f" {type(config.text_config)}." - ) + super().__init__(config) if not isinstance(config.vision_config, AIMv2VisionConfig): raise TypeError( - "config.vision_config is expected to be of type CLIPVisionConfig but is of type" + "config.vision_config is expected to be of type AIMv2VisionConfig but is of type" f" {type(config.vision_config)}." ) - text_config = config.text_config + if not isinstance(config.text_config, AIMv2TextConfig): + raise TypeError( + "config.text_config is expected to be of type AIMv2TextConfig but is of type" + f" {type(config.text_config)}." 
+ ) + vision_config = config.vision_config + text_config = config.text_config self.projection_dim = config.projection_dim - self.text_embed_dim = text_config.hidden_size self.vision_embed_dim = vision_config.hidden_size - - self.text_model = AIMv2TextModel._from_config(text_config) + self.text_embed_dim = text_config.hidden_size self.vision_model = AIMv2VisionModel._from_config(vision_config) + self.text_model = AIMv2TextModel._from_config(text_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) @@ -801,11 +857,9 @@ class AIMv2Model(AIMv2PreTrainedModel): input_ids: Optional[torch.LongTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, AIMv2Output]: r""" @@ -847,23 +901,21 @@ class AIMv2Model(AIMv2PreTrainedModel): pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + return_dict=True, ) text_outputs = self.text_model( input_ids=input_ids, attention_mask=attention_mask, - position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.last_hidden_state image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -871,21 +923,12 @@ class AIMv2Model(AIMv2PreTrainedModel): text_embeds = text_embeds / _get_vector_norm(text_embeds) logit_scale = self.log_logit_scale.clamp(0.0, self.max_log_logit_scale).exp() - logits_per_text = text_embeds * logit_scale.exp().to(text_embeds.device) - logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) - + logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() loss = None - # if return_loss: - # Use the loss used in aimv2 - # loss = clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - - return AIMv2Output( + output = AIMv2Output( loss=loss, logits_per_image=logits_per_image, logits_per_text=logits_per_text, @@ -895,5 +938,7 @@ class AIMv2Model(AIMv2PreTrainedModel): vision_model_output=vision_outputs, ) + return output if return_dict else output.to_tuple() + __all__ = ["AIMv2VisionModel", "AIMv2Model"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 6727d19abda..43aa908f48b 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -19,9 +19,11 @@ import math from typing import Callable, Optional, Tuple, Union import torch +import torch.nn.functional as F from torch import nn -from transformers.modeling_outputs import BaseModelOutput +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from 
transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN @@ -53,6 +55,8 @@ class AIMv2VisionConfig(SiglipVisionConfig): qkv_bias: bool = False, use_bias: bool = False, hidden_act="silu", + initializer_range=0.02, + use_head=True, **kwargs, ): super().__init__( @@ -68,6 +72,8 @@ class AIMv2VisionConfig(SiglipVisionConfig): **kwargs, ) + self.use_head = use_head + self.initializer_range = initializer_range self.attention_dropout = attention_dropout self.use_bias = use_bias self.qkv_bias = qkv_bias @@ -90,10 +96,12 @@ class AIMv2TextConfig(SiglipTextConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, + hidden_act="silu", pad_token_id=None, bos_token_id=None, eos_token_id: int = 49407, max_position_embeddings: int = 77, + initializer_range=0.02, **kwargs, ): super().__init__( @@ -102,32 +110,38 @@ class AIMv2TextConfig(SiglipTextConfig): intermediate_size=intermediate_size, num_hidden_layers=num_hidden_layers, num_attention_heads=num_attention_heads, + hidden_act=hidden_act, max_position_embeddings=max_position_embeddings, - qkv_bias=qkv_bias, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs, ) + self.initializer_range = initializer_range self.attention_dropout = attention_dropout + self.use_bias = use_bias + self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.use_bias = use_bias del self.bos_token_id del self.pad_token_id del self.projection_size - del self.hidden_act del self.layer_norm_eps class AIMv2Config(SiglipConfig): - # Modify default logit scale value accordingly with aimv2 configs - def __init__(self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): super().__init__(text_config, vision_config, **kwargs) self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value + self.max_logit_scale = 100.0 + + del self.initializer_factor pass @@ -170,7 +184,7 @@ class AIMv2VisionEmbeddings(nn.Module): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) num_patches = (config.image_size // config.patch_size) ** 2 - self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + self.position_embedding = nn.Embedding(num_patches, config.hidden_size) self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod @@ -200,7 +214,7 @@ class AIMv2VisionEmbeddings(nn.Module): height // self.patch_size, width // self.patch_size, embed_dim=self.config.hidden_size ) else: - pos_embed = self.position_embeddings(self.position_ids) + pos_embed = self.position_embedding(self.position_ids) hidden_states = hidden_states + pos_embed return hidden_states @@ -215,15 +229,14 @@ def eager_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, value_states: torch.Tensor, - head_mask: Optional[torch.Tensor], - scaling: float, + attention_mask: Optional[torch.Tensor], dropout: float = 0.0, **kwargs, ): - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * scaling + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - if head_mask is not None: - attn_weights = attn_weights + head_mask + if attention_mask is not 
None: + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -250,7 +263,6 @@ class AIMv2Attention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads})." ) - self.scale = self.head_dim**-0.5 self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) @@ -261,7 +273,7 @@ class AIMv2Attention(nn.Module): def forward( self, hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -292,9 +304,8 @@ class AIMv2Attention(nn.Module): query_states, key_states, value_states, - head_mask, + attention_mask, dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scale, is_causal=False, **kwargs, ) @@ -320,12 +331,12 @@ class AIMv2EncoderLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) - attn_output, attn_wights = self.attention( - hidden_states=norm_hidden_states, head_mask=head_mask, output_attentions=output_attentions + attn_output, attn_weights = self.attention( + hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions ) hidden_states = hidden_states + attn_output @@ -333,13 +344,42 @@ class AIMv2EncoderLayer(nn.Module): mlp_output = self.ffn(norm_hidden_states) hidden_states = hidden_states + mlp_output - return (hidden_states, attn_wights) if output_attentions else (hidden_states, None) + return (hidden_states, attn_weights) if output_attentions else (hidden_states, None) class AIMv2Encoder(SiglipEncoder): pass +class AIMv2AttentionPoolingHead(nn.Module): + def __init__(self, config: AIMv2VisionConfig): + super().__init__() + dim = config.hidden_size + qkv_bias = config.qkv_bias + + self.num_heads = config.num_attention_heads + + self.k = nn.Linear(dim, dim, bias=qkv_bias) + self.v = nn.Linear(dim, dim, bias=qkv_bias) + self.cls_token = nn.Parameter(torch.randn(1, 1, dim) * 0.02) + self.linear = nn.Linear(dim, dim, bias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + cls_token = self.cls_token.expand(B, -1, -1) + + q = cls_token.reshape(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = self.k(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + v = self.v(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + x_cls = F.scaled_dot_product_attention(q, k, v) + x_cls = x_cls.transpose(1, 2).reshape(B, 1, C) + x_cls = x_cls.mean(dim=1) + + out = self.linear(x_cls) + return out + + class AIMv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -354,7 +394,11 @@ class AIMv2PreTrainedModel(PreTrainedModel): _supports_sdpa = True def _init_weights(self, module): - std = self.config.initializer_range + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) if isinstance(module, 
(nn.Linear, nn.Conv2d)): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: @@ -373,13 +417,18 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + # Use attention pooling head only for lit vairant + self.use_head = config.use_head + if self.use_head: + self.head = AIMv2AttentionPoolingHead(config) + # Initialize weights and apply final processing self.post_init() def forward( self, pixel_values, - head_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -394,38 +443,44 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): encoder_outputs = self.encoder( inputs_embeds=hidden_states, - head_mask=head_mask, + attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) - return BaseModelOutput( + if self.use_head: + last_hidden_state = self.head(last_hidden_state) + + output = BaseModelOutput( last_hidden_state=last_hidden_state, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + return output if return_dict else output.to_tuple() + class AIMv2TextModel(AIMv2PreTrainedModel): - def __init__(self, config: AIMv2VisionConfig): + def __init__(self, config: AIMv2TextConfig): super().__init__(config) self.config = config self.embeddings = AIMv2TextEmbeddings(config) self.encoder = AIMv2Encoder(config) - # Here comes the eos extract class - self.head = nn.Identity() + self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + self.eos_token_id = config.eos_token_id # Initialize weights and apply final processing self.post_init() def forward( self, - pixel_values, - head_mask: Optional[torch.Tensor] = None, + input_ids, + attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -436,52 +491,66 @@ class AIMv2TextModel(AIMv2PreTrainedModel): ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - hidden_states = self.embeddings(pixel_values) + hidden_states = self.embeddings(input_ids) + _, seq_len, _ = hidden_states.shape + + mask_converter = AttentionMaskConverter(True) + attention_mask = mask_converter.to_4d( + attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype + ) encoder_outputs = self.encoder( inputs_embeds=hidden_states, - head_mask=head_mask, + attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) - return BaseModelOutput( + # Get pooled output + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1), + ] + + output = BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, + pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) + return output if return_dict else 
output.to_tuple() + class AIMv2Model(CLIPModel, nn.Module): def __init__(self, config: AIMv2Config): - nn.Module().__init__() - - if not isinstance(config.text_config, AIMv2VisionConfig): - raise TypeError( - "config.text_config is expected to be of type CLIPTextConfig but is of type" - f" {type(config.text_config)}." - ) + nn.Module().__init__(config) if not isinstance(config.vision_config, AIMv2VisionConfig): raise TypeError( - "config.vision_config is expected to be of type CLIPVisionConfig but is of type" + "config.vision_config is expected to be of type AIMv2VisionConfig but is of type" f" {type(config.vision_config)}." ) - text_config = config.text_config + if not isinstance(config.text_config, AIMv2TextConfig): + raise TypeError( + "config.text_config is expected to be of type AIMv2TextConfig but is of type" + f" {type(config.text_config)}." + ) + vision_config = config.vision_config + text_config = config.text_config self.projection_dim = config.projection_dim - self.text_embed_dim = text_config.hidden_size self.vision_embed_dim = vision_config.hidden_size - - self.text_model = AIMv2TextModel._from_config(text_config) + self.text_embed_dim = text_config.hidden_size self.vision_model = AIMv2VisionModel._from_config(vision_config) + self.text_model = AIMv2TextModel._from_config(text_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) @@ -500,11 +569,9 @@ class AIMv2Model(CLIPModel, nn.Module): input_ids: Optional[torch.LongTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, return_dict: Optional[bool] = None, ) -> Union[Tuple, AIMv2Output]: r""" @@ -542,23 +609,21 @@ class AIMv2Model(CLIPModel, nn.Module): pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, + return_dict=True, ) text_outputs = self.text_model( input_ids=input_ids, attention_mask=attention_mask, - position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.last_hidden_state image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -566,21 +631,12 @@ class AIMv2Model(CLIPModel, nn.Module): text_embeds = text_embeds / _get_vector_norm(text_embeds) logit_scale = self.log_logit_scale.clamp(0.0, self.max_log_logit_scale).exp() - logits_per_text = text_embeds * logit_scale.exp().to(text_embeds.device) - logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) - + logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() loss = None - # if return_loss: - # Use the loss used in aimv2 - # loss = clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - - return 
AIMv2Output( + output = AIMv2Output( loss=loss, logits_per_image=logits_per_image, logits_per_text=logits_per_text, @@ -590,5 +646,7 @@ class AIMv2Model(CLIPModel, nn.Module): vision_model_output=vision_outputs, ) + return output if return_dict else output.to_tuple() + __all__ = ["AIMv2Config", "AIMv2VisionConfig", "AIMv2TextConfig", "AIMv2VisionModel", "AIMv2Model"] From 2af7afb7c406f591cb46228490574952ad914d06 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 16:23:34 +0530 Subject: [PATCH 14/62] Minor fixes --- .../models/aimv2/configuration_aimv2.py | 19 +++++++++++++------ .../convert_aimv2_original_pytorch_to_hf.py | 1 - 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 17eb9129665..345e1cba466 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -19,7 +19,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -93,6 +92,8 @@ class AIMv2VisionConfig(PretrainedConfig): qkv_bias: bool = False, use_bias: bool = False, hidden_act="silu", + initializer_range=0.02, + use_head=True, **kwargs, ): super().__init__(**kwargs) @@ -104,9 +105,11 @@ class AIMv2VisionConfig(PretrainedConfig): self.num_channels = num_channels self.patch_size = patch_size self.image_size = image_size - self.attention_dropout = attention_dropout self.hidden_act = hidden_act + + self.use_head = use_head + self.initializer_range = initializer_range self.use_bias = use_bias self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps @@ -184,10 +187,12 @@ class AIMv2TextConfig(PretrainedConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, + hidden_act="silu", pad_token_id=None, bos_token_id=None, eos_token_id: int = 49407, max_position_embeddings: int = 77, + initializer_range=0.02, **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -198,11 +203,14 @@ class AIMv2TextConfig(PretrainedConfig): self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.max_position_embeddings = max_position_embeddings - + self.hidden_act = hidden_act self.attention_dropout = attention_dropout + + self.initializer_range = initializer_range + self.use_bias = use_bias + self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.use_bias = use_bias class AIMv2Config(PretrainedConfig): @@ -265,10 +273,9 @@ class AIMv2Config(PretrainedConfig): self.text_config = AIMv2TextConfig(**text_config) self.vision_config = AIMv2VisionConfig(**vision_config) - - self.initializer_factor = 1.0 self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value + self.max_logit_scale = 100.0 @classmethod def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs): diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 6b6a585997e..18e1d635593 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -254,4 +254,3 @@ def main(): if __name__ == 
"__main__": main() - From 16f0b9288742b178b9185b88d3670ebf28361701 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 16:27:36 +0530 Subject: [PATCH 15/62] nits --- src/transformers/models/aimv2/modeling_aimv2.py | 2 +- src/transformers/models/aimv2/modular_aimv2.py | 3 +-- src/transformers/utils/dummy_pt_objects.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index acf84772b1c..67eff9d5502 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -941,4 +941,4 @@ class AIMv2Model(AIMv2PreTrainedModel): return output if return_dict else output.to_tuple() -__all__ = ["AIMv2VisionModel", "AIMv2Model"] +__all__ = ["AIMv2VisionModel", "AIMv2Model", "AIMv2PreTrainedModel"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 43aa908f48b..bb183d60a71 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -132,7 +132,6 @@ class AIMv2TextConfig(SiglipTextConfig): class AIMv2Config(SiglipConfig): - def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs ): @@ -649,4 +648,4 @@ class AIMv2Model(CLIPModel, nn.Module): return output if return_dict else output.to_tuple() -__all__ = ["AIMv2Config", "AIMv2VisionConfig", "AIMv2TextConfig", "AIMv2VisionModel", "AIMv2Model"] +__all__ = ["AIMv2Config", "AIMv2VisionConfig", "AIMv2TextConfig", "AIMv2VisionModel", "AIMv2Model","AIMv2PreTrainedModel"] diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index dc70594f1fc..d8d2f067682 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -563,7 +563,7 @@ class AIMv2Model(metaclass=DummyObject): requires_backends(self, ["torch"]) -class AIMv2PreTrainedModel(metaclass=DummyObject): +class AIMv2VisionModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): From 544c19c9b193376e6237bbe9b2551fd94595df42 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 18:12:20 +0530 Subject: [PATCH 16/62] update --- src/transformers/__init__.py | 7 ++----- .../models/aimv2/configuration_aimv2.py | 2 +- src/transformers/models/aimv2/modeling_aimv2.py | 6 +++++- src/transformers/models/aimv2/modular_aimv2.py | 16 ++++++++++++++-- .../models/auto/configuration_auto.py | 1 + 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 712eb3684ac..9e42e645c59 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1858,11 +1858,7 @@ else: ] ) _import_structure["models.aimv2"].extend( - [ - "AIMv2Model", - "AIMv2PreTrainedModel", - "AIMv2VisionModel", - ] + ["AIMv2Model", "AIMv2PreTrainedModel", "AIMv2VisionModel", "AIMv2TextModel"] ) _import_structure["models.clipseg"].extend( [ @@ -6720,6 +6716,7 @@ if TYPE_CHECKING: from .modeling_utils import PreTrainedModel from .models.aimv2 import ( AIMv2Model, + AIMv2TextModel, AIMv2VisionModel, ) from .models.albert import ( diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 345e1cba466..48ccdd7bb24 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ 
b/src/transformers/models/aimv2/configuration_aimv2.py @@ -93,7 +93,7 @@ class AIMv2VisionConfig(PretrainedConfig): use_bias: bool = False, hidden_act="silu", initializer_range=0.02, - use_head=True, + use_head=False, **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 67eff9d5502..8e90961e31a 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -479,6 +479,8 @@ class AIMv2PreTrainedModel(PreTrainedModel): class AIMv2VisionModel(AIMv2PreTrainedModel): + main_input_name = "pixel_values" + def __init__(self, config: AIMv2VisionConfig): super().__init__(config) self.config = config @@ -534,6 +536,8 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): class AIMv2TextModel(AIMv2PreTrainedModel): + main_input_name = "input_ids" + def __init__(self, config: AIMv2TextConfig): super().__init__(config) self.config = config @@ -941,4 +945,4 @@ class AIMv2Model(AIMv2PreTrainedModel): return output if return_dict else output.to_tuple() -__all__ = ["AIMv2VisionModel", "AIMv2Model", "AIMv2PreTrainedModel"] +__all__ = ["AIMv2VisionModel", "AIMv2Model", "AIMv2PreTrainedModel", "AIMv2TextModel"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index bb183d60a71..e5df5d171ba 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -56,7 +56,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): use_bias: bool = False, hidden_act="silu", initializer_range=0.02, - use_head=True, + use_head=False, **kwargs, ): super().__init__( @@ -409,6 +409,8 @@ class AIMv2PreTrainedModel(PreTrainedModel): class AIMv2VisionModel(AIMv2PreTrainedModel): + main_input_name = "pixel_values" + def __init__(self, config: AIMv2VisionConfig): super().__init__(config) self.config = config @@ -464,6 +466,8 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): class AIMv2TextModel(AIMv2PreTrainedModel): + main_input_name = "input_ids" + def __init__(self, config: AIMv2TextConfig): super().__init__(config) self.config = config @@ -648,4 +652,12 @@ class AIMv2Model(CLIPModel, nn.Module): return output if return_dict else output.to_tuple() -__all__ = ["AIMv2Config", "AIMv2VisionConfig", "AIMv2TextConfig", "AIMv2VisionModel", "AIMv2Model","AIMv2PreTrainedModel"] +__all__ = [ + "AIMv2Config", + "AIMv2VisionConfig", + "AIMv2TextConfig", + "AIMv2VisionModel", + "AIMv2Model", + "AIMv2PreTrainedModel", + "AIMv2TextModel", +] diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 8f5d489a70f..85480d546c7 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -360,6 +360,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here ("aimv2", "AIMv2"), + ("aimv2_text_model", "AIMv2TextModel"), ("aimv2_vision_model", "AIMv2VisionModel"), ("albert", "ALBERT"), ("align", "ALIGN"), From 99f6e5ee0baf511783e8d9d633b5b1d35a628ef0 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 18:12:33 +0530 Subject: [PATCH 17/62] Preliminary tests --- tests/models/aimv2/test_modeling_aimv2.py | 711 ++++++++++++++++++---- 1 file changed, 592 insertions(+), 119 deletions(-) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 8701838d635..50e0cc49c06 100644 --- 
a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,49 +14,73 @@ # limitations under the License. """Testing suite for the PyTorch AIMv2 model.""" +import inspect +import os +import tempfile import unittest -from transformers import AIMv2Config +import numpy as np +from parameterized import parameterized +from pytest import mark + +from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig from transformers.testing_utils import ( - is_flaky, + require_flash_attn, require_torch, + require_torch_gpu, + require_torch_sdpa, + slow, torch_device, ) -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import ( + is_torch_available, + is_vision_available, +) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, + is_flaky, + random_attention_mask, +) from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): + import torch from torch import nn - from transformers import AIMv2ForImageClassification, AIMv2Model + from transformers import ( + AIMv2Model, + AIMv2TextModel, + AIMv2VisionModel, + ) if is_vision_available(): pass -class AIMv2ModelTester: +class AIMv2VisionModelTester: def __init__( self, parent, - batch_size=13, + batch_size=12, image_size=30, patch_size=2, num_channels=3, is_training=True, - use_labels=True, hidden_size=32, + projection_dim=32, num_hidden_layers=2, num_attention_heads=4, intermediate_size=37, - hidden_act="silu", - type_sequence_label_size=10, + dropout=0.1, + attention_dropout=0.1, initializer_range=0.02, - scope=None, ): self.parent = parent self.batch_size = batch_size @@ -64,126 +88,284 @@ class AIMv2ModelTester: self.patch_size = patch_size self.num_channels = num_channels self.is_training = is_training - self.use_labels = use_labels self.hidden_size = hidden_size + self.projection_dim = projection_dim self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.type_sequence_label_size = type_sequence_label_size + self.dropout = dropout + self.attention_dropout = attention_dropout self.initializer_range = initializer_range - self.scope = scope num_patches = (image_size // patch_size) ** 2 self.seq_length = num_patches def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - config = self.get_config() - return config, pixel_values, labels + return config, pixel_values def get_config(self): - return AIMv2Config( + return AIMv2VisionConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, hidden_size=self.hidden_size, + projection_dim=self.projection_dim, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - 
is_decoder=False, + dropout=self.dropout, + attention_dropout=self.attention_dropout, initializer_range=self.initializer_range, ) - def create_and_check_model(self, config, pixel_values, labels): - model = AIMv2Model(config=config) + def create_and_check_model(self, config, pixel_values): + model = AIMv2VisionModel(config=config) model.to(torch_device) model.eval() - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + with torch.no_grad(): + result = model(pixel_values) - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = AIMv2ForImageClassification(config) - model.to(torch_device) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = AIMv2ForImageClassification(config) - model.to(torch_device) - model.eval() - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs + config, pixel_values = config_and_inputs inputs_dict = {"pixel_values": pixel_values} return config, inputs_dict -@require_torch -class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): +class AIMv2ModelTesterMixin(ModelTesterMixin): """ - Here we also overwrite some of the tests of test_modeling_common.py, as Dinov2 does not use input_ids, inputs_embeds, + Subclass of ModelTesterMixin with methods specific to testing AIMv2 models. + The SDPA equivalence test is overridden here because AIMv2 models may have test/vision/text+vision inputs, + different output logits, and are not supposed to be used or tested with padding_side="left". 
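    For reference, the text tower pools its sequence output at the first EOS position, which is why the
    right-padded inputs used further below are the supported layout. A minimal, self-contained sketch of that
    pooling step (mirroring `AIMv2TextModel.forward`; the tensors here are made up for illustration):

    ```python
    import torch

    eos_token_id = 49407                                               # default in AIMv2TextConfig
    input_ids = torch.tensor([[11, 22, eos_token_id, 0, 0]])           # right-padded sequence
    last_hidden_state = torch.randn(1, 5, 32)                          # (batch, seq_len, hidden_size)

    eos_positions = (input_ids == eos_token_id).int().argmax(dim=-1)   # first EOS per sequence
    pooled_output = last_hidden_state[torch.arange(input_ids.shape[0]), eos_positions]
    print(pooled_output.shape)                                         # torch.Size([1, 32])
    ```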
+ """ + + def test_sdpa_can_dispatch_composite_models(self): + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + # Load the model with SDPA + model_sdpa = model_class.from_pretrained(tmpdirname) + model_sdpa = model_sdpa.eval().to(torch_device) + + # Load model with eager attention + model_eager = model_class.from_pretrained( + tmpdirname, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + # SigLip has one shared cls attr for all models, so we assign both submodels heer + vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager" + + # `None` as it is the requested one which will be assigned to each sub-config + # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) + if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"): + self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn) + self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn) + self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + self.assertTrue(model_eager.text_model.config._attn_implementation == "eager") + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + class_name = submodule.__class__.__name__ + if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + + +@require_torch +class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as AIMv2 does not use input_ids, inputs_embeds, attention_mask and seq_length. 
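    Concretely, the common-test inputs for the vision tower reduce to a single image tensor; a small sketch of
    what the tester above prepares, with its default sizes filled in (values taken from `AIMv2VisionModelTester`):

    ```python
    import torch

    batch_size, num_channels, image_size = 12, 3, 30                              # AIMv2VisionModelTester defaults
    pixel_values = torch.rand(batch_size, num_channels, image_size, image_size)   # roughly what floats_tensor returns
    inputs_dict = {"pixel_values": pixel_values}                                   # no input_ids / attention_mask / seq_length
    ```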
""" - test_torch_exportable = True - - all_model_classes = ( - ( - AIMv2Model, - AIMv2ForImageClassification, - ) - if is_torch_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": AIMv2Model, "image-classification": AIMv2ForImageClassification} - if is_torch_available() - else {} - ) + all_model_classes = (AIMv2VisionModel,) if is_torch_available() else () fx_compatible = True - test_pruning = False test_resize_embeddings = False test_head_masking = False def setUp(self): - self.model_tester = AIMv2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=AIMv2Config, has_text_modality=False, hidden_size=37) - - @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.") - def test_initialization(self): - super().test_initialization() + self.model_tester = AIMv2VisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=AIMv2VisionConfig, has_text_modality=False, hidden_size=37 + ) def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="Dinov2 does not use inputs_embeds") + @unittest.skip(reason="AIMv2 does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("last_hidden_state", "pooler_output", "image_embeds"), + use_attention_mask_options=(None,), + ) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + +class AIMv2TextModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + 
self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask + + def get_config(self): + return AIMv2TextConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + ) + + def create_and_check_model(self, config, input_ids, input_mask): + model = AIMv2TextModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, input_mask = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): + all_model_classes = (AIMv2TextModel,) if is_torch_available() else () + fx_compatible = True + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = AIMv2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=AIMv2TextConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip + def test_training(self): + pass + + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -199,58 +381,349 @@ class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() + # @unittest.skip(reason="AIMv2 does not use inputs_embeds") + # def test_inputs_embeds(self): + # pass - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) + # @unittest.skip(reason="AIMv2TextModel has no base class and is not available in MODEL_MAPPING") + # def test_save_load_fast_init_from_base(self): + # pass + + # @unittest.skip(reason="AIMv2TextModel has no base 
class and is not available in MODEL_MAPPING") + # def test_save_load_fast_init_to_base(self): + # pass + + # @slow + # def test_model_from_pretrained(self): + # model_name = "openai/AIMv2-vit-base-patch32" + # model = AIMv2TextModel.from_pretrained(model_name) + # self.assertIsNotNone(model) + + # @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + # @require_torch_sdpa + # @slow + # @is_flaky() + # def test_eager_matches_sdpa_inference(self, torch_dtype: str): + # super().test_eager_matches_sdpa_inference( + # torch_dtype=torch_dtype, + # logit_keys=("last_hidden_state", "pooler_output", "text_embeds"), + # use_attention_mask_options=(None, "right"), # "left" is not supported for text model + # ) + + # @require_torch_sdpa + # def test_sdpa_can_dispatch_composite_models(self): + # super().test_sdpa_can_dispatch_composite_models() + + # @require_torch_sdpa + # def test_sdpa_can_dispatch_on_flash(self): + # self.skipTest(reason="AIMv2TextModel has two attention masks: `causal_attention_mask` and `attention_mask`") + + +class AIMv2ModelTester: + def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + if text_kwargs is None: + text_kwargs = {} + if vision_kwargs is None: + vision_kwargs = {} + + self.parent = parent + self.text_model_tester = AIMv2TextModelTester(parent, **text_kwargs) + self.vision_model_tester = AIMv2VisionModelTester(parent, **vision_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.is_training = is_training + + def prepare_config_and_inputs(self): + text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def get_config(self): + return AIMv2Config.from_text_vision_configs( + self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + ) + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = AIMv2Model(config).to(torch_device).eval() + with torch.no_grad(): + result = model(input_ids, pixel_values, attention_mask) + self.parent.assertEqual( + result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "return_loss": True, + } + return config, inputs_dict + + +@require_torch +class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (AIMv2Model,) if is_torch_available() else () + pipeline_model_mapping = ( + {"feature-extraction": AIMv2Model, "image-feature-extraction": AIMv2VisionModel} + if is_torch_available() + else {} + ) + fx_compatible = True + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + _is_composite = True + + def setUp(self): + self.model_tester = AIMv2ModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, 
config_class=AIMv2Config, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() - @unittest.skip(reason="Dinov2 does not support feedforward chunking yet") - def test_feed_forward_chunking(self): + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): pass + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass -# We will verify our results on an image of cute cats -# def prepare_img(): -# image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") -# return image + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass -# @require_torch -# @require_vision -# class Dinov2ModelIntegrationTest(unittest.TestCase): -# @cached_property -# def default_image_processor(self): -# return AutoImageProcessor.from_pretrained("facebook/dinov2-base") if is_vision_available() else None + @unittest.skip(reason="AIMv2Model does not have input/output embeddings") + def test_model_get_set_embeddings(self): + pass -# @slow -# def test_inference_no_head(self): -# model = Dinov2Model.from_pretrained("facebook/dinov2-base").to(torch_device) + # override as the `logit_scale` parameter initialization is different for AIMv2 + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() -# image_processor = self.default_image_processor -# image = prepare_img() -# inputs = image_processor(image, return_tensors="pt").to(torch_device) + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) -# # forward pass -# with torch.no_grad(): -# outputs = model(**inputs) + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") -# # verify the last hidden states -# expected_shape = torch.Size((1, 257, 768)) -# self.assertEqual(outputs.last_hidden_state.shape, expected_shape) + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() -# expected_slice = torch.tensor( -# [[-2.2005, -0.4495, 1.0964], [-3.3959, -0.8942, -1.0315], [-2.9355, 1.1564, -0.7656]], -# device=torch_device, -# ) -# torch.testing.assert_close(outputs.last_hidden_state[0, :3, :3], 
expected_slice, rtol=1e-3, atol=1e-3) + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # AIMv2 needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + def test_load_vision_text_config(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # Save AIMv2Config and check if we can load AIMv2VisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = AIMv2VisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save AIMv2Config and check if we can load AIMv2TextConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + text_config = AIMv2TextConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + @is_flaky() + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + super().test_eager_matches_sdpa_inference( + torch_dtype=torch_dtype, + logit_keys=("logits_per_image", "logits_per_text"), + use_attention_mask_options=(None, "right"), # "left" is not supported for text model + ) + + @require_torch_sdpa + def test_sdpa_can_dispatch_composite_models(self): + super().test_sdpa_can_dispatch_composite_models() + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = 
model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) + dummy_input_ids = inputs_dict["input_ids"] + + outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True + ) + + self.assertTrue( + torch.allclose(outputs.logits_per_image, outputs_fa.logits_per_image, atol=4e-2, rtol=4e-2), + f"Image logits max diff: {torch.max(torch.abs(outputs.logits_per_image - outputs_fa.logits_per_image))}", + ) + self.assertTrue( + torch.allclose(outputs.logits_per_text, outputs_fa.logits_per_text, atol=4e-2, rtol=4e-2), + f"Text logits max diff: {torch.max(torch.abs(outputs.logits_per_text - outputs_fa.logits_per_text))}", + ) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + def test_flash_attn_2_inference_equivalence_right_padding(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + model.to(torch_device) + + dummy_pixel_values = inputs_dict["pixel_values"].to(torch.bfloat16) + dummy_input_ids = inputs_dict["input_ids"] + dummy_pixel_mask = inputs_dict["attention_mask"] + + # right padding + dummy_pixel_mask[:] = 1 + dummy_pixel_mask[:, -1:] = 0 + + outputs = model(pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True) + outputs_fa = model_fa( + pixel_values=dummy_pixel_values, input_ids=dummy_input_ids, output_hidden_states=True + ) + + logits_per_image_eager = outputs.logits_per_image[:, :-1] + logits_per_text_eager = outputs.logits_per_text[:, :-1] + + logits_per_image_sdpa = outputs_fa.logits_per_image[:, :-1] + logits_per_text_sdpa = outputs_fa.logits_per_text[:, :-1] + + self.assertTrue( + torch.allclose(logits_per_image_eager, logits_per_image_sdpa, atol=4e-2, rtol=4e-2), + f"Image logits max diff: {torch.max(torch.abs(logits_per_image_eager - logits_per_image_sdpa))}", + ) + self.assertTrue( + torch.allclose(logits_per_text_eager, logits_per_text_sdpa, atol=4e-2, rtol=4e-2), + f"Text logits max diff: {torch.max(torch.abs(logits_per_text_eager - logits_per_text_sdpa))}", + ) From cf4a128c6d5037ba211bb1c7434b7c37db8ffd32 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 21:57:46 +0530 Subject: [PATCH 18/62] More fixes --- .../models/aimv2/configuration_aimv2.py | 2 +- .../models/aimv2/modeling_aimv2.py | 50 +++++++----- .../models/aimv2/modular_aimv2.py | 81 ++++++++++++++----- 3 files changed, 90 insertions(+), 43 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 48ccdd7bb24..345e1cba466 100644 --- 
a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -93,7 +93,7 @@ class AIMv2VisionConfig(PretrainedConfig): use_bias: bool = False, hidden_act="silu", initializer_range=0.02, - use_head=False, + use_head=True, **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 8e90961e31a..e4f0e42c926 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -50,8 +50,7 @@ logger = logging.get_logger(__name__) class AIMv2Output(ModelOutput): """ Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): - Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. @@ -59,16 +58,15 @@ class AIMv2Output(ModelOutput): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`AIMv2TextModel`]. + The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`AIMv2VisionModel`]. + The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. text_model_output (`BaseModelOutputWithPooling`): - The output of the [`AIMv2TextModel`]. - vision_model_output (`BaseModelOutputWithPooling`): - The output of the [`AIMv2VisionModel`]. + The output of the [`CLIPTextModel`]. + vision_model_output (`BaseModelOutput`): + The output of the [`CLIPVisionModel`]. 
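    Example (a hedged sketch with randomly initialized weights and default sub-configs, only meant to
    illustrate the field shapes; `logits_per_text` is `logit_scale * text_embeds @ image_embeds.T` and
    `logits_per_image` is its transpose):

    ```python
    import torch
    from transformers import AIMv2Config, AIMv2Model

    config = AIMv2Config()                     # randomly initialized text + vision sub-configs
    model = AIMv2Model(config).eval()

    image_size = config.vision_config.image_size
    pixel_values = torch.randn(2, 3, image_size, image_size)                # 2 images
    input_ids = torch.randint(0, config.text_config.vocab_size, (4, 16))    # 4 texts

    with torch.no_grad():
        outputs = model(input_ids=input_ids, pixel_values=pixel_values)

    print(outputs.logits_per_image.shape)      # (image_batch_size, text_batch_size) -> torch.Size([2, 4])
    print(outputs.logits_per_text.shape)       # (text_batch_size, image_batch_size) -> torch.Size([4, 2])
    print(outputs.image_embeds.shape)          # (2, config.projection_dim)
    print(outputs.text_embeds.shape)           # (4, config.projection_dim)
    ```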
""" - loss: Optional[torch.FloatTensor] = None logits_per_image: torch.FloatTensor = None logits_per_text: torch.FloatTensor = None text_embeds: torch.FloatTensor = None @@ -457,16 +455,15 @@ class AIMv2PreTrainedModel(PreTrainedModel): config_class = AIMv2Config base_model_prefix = "aimv2" - main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["AIMv2SwiGLUFFN"] _supports_sdpa = True def _init_weights(self, module): std = ( - self.config.initializer_range - if hasattr(self.config, "initializer_range") - else self.config.text_config.initializer_range + self.config.vision_config.initializer_range + if hasattr(self.config, "vision_config") + else self.config.initializer_range ) if isinstance(module, (nn.Linear, nn.Conv2d)): module.weight.data.normal_(mean=0.0, std=std) @@ -496,6 +493,9 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embed + def forward( self, pixel_values, @@ -523,11 +523,13 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) + pooler_output = None if self.use_head: - last_hidden_state = self.head(last_hidden_state) + pooler_output = self.head(last_hidden_state) - output = BaseModelOutput( + output = BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, + pooler_output=pooler_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @@ -550,6 +552,12 @@ class AIMv2TextModel(AIMv2PreTrainedModel): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.embeddings.token_embedding = value + def forward( self, input_ids, @@ -567,10 +575,11 @@ class AIMv2TextModel(AIMv2PreTrainedModel): hidden_states = self.embeddings(input_ids) _, seq_len, _ = hidden_states.shape - mask_converter = AttentionMaskConverter(True) - attention_mask = mask_converter.to_4d( - attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype - ) + if attention_mask is not None: + mask_converter = AttentionMaskConverter(True) + attention_mask = mask_converter.to_4d( + attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype + ) encoder_outputs = self.encoder( inputs_embeds=hidden_states, @@ -916,7 +925,7 @@ class AIMv2Model(AIMv2PreTrainedModel): return_dict=True, ) - image_embeds = vision_outputs.last_hidden_state + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) text_embeds = text_outputs.pooler_output @@ -930,10 +939,7 @@ class AIMv2Model(AIMv2PreTrainedModel): logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() - loss = None - output = AIMv2Output( - loss=loss, logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index e5df5d171ba..c5dcd01bb40 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -16,21 +16,23 @@ """Pytorch implementation of AIMv2 Model""" import math -from typing import Callable, Optional, Tuple, Union +from dataclasses import dataclass +from typing import Any, 
Callable, Optional, Tuple, Union import torch import torch.nn.functional as F from torch import nn from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_outputs import BaseModelOutputWithPooling from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN from ...utils import ( + ModelOutput, logging, ) -from ..clip.modeling_clip import CLIPModel, CLIPOutput, CLIPTextEmbeddings, _get_vector_norm +from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm from ..llama.modeling_llama import LlamaRMSNorm from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig from ..siglip.modeling_siglip import SiglipEncoder @@ -56,7 +58,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): use_bias: bool = False, hidden_act="silu", initializer_range=0.02, - use_head=False, + use_head=True, **kwargs, ): super().__init__( @@ -145,8 +147,39 @@ class AIMv2Config(SiglipConfig): pass -class AIMv2Output(CLIPOutput): - pass +@dataclass +class AIMv2Output(ModelOutput): + """ + Args: + + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`CLIPTextModel`]. + vision_model_output (`BaseModelOutput`): + The output of the [`CLIPVisionModel`]. 
+ """ + + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) class AIMv2RMSNorm(LlamaRMSNorm): @@ -387,16 +420,15 @@ class AIMv2PreTrainedModel(PreTrainedModel): config_class = AIMv2Config base_model_prefix = "aimv2" - main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["AIMv2SwiGLUFFN"] _supports_sdpa = True def _init_weights(self, module): std = ( - self.config.initializer_range - if hasattr(self.config, "initializer_range") - else self.config.text_config.initializer_range + self.config.vision_config.initializer_range + if hasattr(self.config, "vision_config") + else self.config.initializer_range ) if isinstance(module, (nn.Linear, nn.Conv2d)): module.weight.data.normal_(mean=0.0, std=std) @@ -426,6 +458,9 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embed + def forward( self, pixel_values, @@ -453,11 +488,13 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) + pooler_output = None if self.use_head: - last_hidden_state = self.head(last_hidden_state) + pooler_output = self.head(last_hidden_state) - output = BaseModelOutput( + output = BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, + pooler_output=pooler_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @@ -480,6 +517,12 @@ class AIMv2TextModel(AIMv2PreTrainedModel): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.embeddings.token_embedding = value + def forward( self, input_ids, @@ -497,10 +540,11 @@ class AIMv2TextModel(AIMv2PreTrainedModel): hidden_states = self.embeddings(input_ids) _, seq_len, _ = hidden_states.shape - mask_converter = AttentionMaskConverter(True) - attention_mask = mask_converter.to_4d( - attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype - ) + if attention_mask is not None: + mask_converter = AttentionMaskConverter(True) + attention_mask = mask_converter.to_4d( + attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype + ) encoder_outputs = self.encoder( inputs_embeds=hidden_states, @@ -623,7 +667,7 @@ class AIMv2Model(CLIPModel, nn.Module): return_dict=True, ) - image_embeds = vision_outputs.last_hidden_state + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) text_embeds = text_outputs.pooler_output @@ -637,10 +681,7 @@ class AIMv2Model(CLIPModel, nn.Module): logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() - loss = None - output = AIMv2Output( - loss=loss, logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, From be7490af523351de02d3b4de078a7a17596abe9a Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 26 Mar 2025 
21:58:02 +0530 Subject: [PATCH 19/62] =?UTF-8?q?Updated=20tests=20=F0=9F=A4=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/models/aimv2/test_modeling_aimv2.py | 223 ++-------------------- 1 file changed, 20 insertions(+), 203 deletions(-) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 50e0cc49c06..1c65c4ca195 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -15,12 +15,10 @@ """Testing suite for the PyTorch AIMv2 model.""" import inspect -import os import tempfile import unittest import numpy as np -from parameterized import parameterized from pytest import mark from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig @@ -28,7 +26,6 @@ from transformers.testing_utils import ( require_flash_attn, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -40,10 +37,8 @@ from transformers.utils import ( from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, - _config_zero_init, floats_tensor, ids_tensor, - is_flaky, random_attention_mask, ) from ...test_pipeline_mixin import PipelineTesterMixin @@ -80,7 +75,6 @@ class AIMv2VisionModelTester: intermediate_size=37, dropout=0.1, attention_dropout=0.1, - initializer_range=0.02, ): self.parent = parent self.batch_size = batch_size @@ -95,7 +89,6 @@ class AIMv2VisionModelTester: self.intermediate_size = intermediate_size self.dropout = dropout self.attention_dropout = attention_dropout - self.initializer_range = initializer_range num_patches = (image_size // patch_size) ** 2 self.seq_length = num_patches @@ -118,7 +111,6 @@ class AIMv2VisionModelTester: intermediate_size=self.intermediate_size, dropout=self.dropout, attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, ) def create_and_check_model(self, config, pixel_values): @@ -128,8 +120,7 @@ class AIMv2VisionModelTester: with torch.no_grad(): result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -155,7 +146,6 @@ class AIMv2ModelTesterMixin(ModelTesterMixin): # Load the model with SDPA model_sdpa = model_class.from_pretrained(tmpdirname) - model_sdpa = model_sdpa.eval().to(torch_device) # Load model with eager attention model_eager = model_class.from_pretrained( @@ -164,34 +154,17 @@ class AIMv2ModelTesterMixin(ModelTesterMixin): ) model_eager = model_eager.eval().to(torch_device) - # SigLip has one shared cls attr for all models, so we assign both submodels heer - vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager" - - # `None` as it is the requested one which will be assigned to each sub-config - # Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present) - if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"): - self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn) - self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn) + if hasattr(model_sdpa, "vision_model"): + 
self.assertTrue(model_sdpa.vision_model.config._attn_implementation == "sdpa") self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager") + + if hasattr(model_sdpa, "text_model"): + self.assertTrue(model_sdpa.text_model.config._attn_implementation == "sdpa") self.assertTrue(model_eager.text_model.config._attn_implementation == "eager") self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") self.assertTrue(model_eager.config._attn_implementation == "eager") - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - @require_torch class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): @@ -201,7 +174,7 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): """ all_model_classes = (AIMv2VisionModel,) if is_torch_available() else () - fx_compatible = True + fx_compatible = False test_pruning = False test_resize_embeddings = False test_head_masking = False @@ -244,21 +217,6 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - super().test_eager_matches_sdpa_inference( - torch_dtype=torch_dtype, - logit_keys=("last_hidden_state", "pooler_output", "image_embeds"), - use_attention_mask_options=(None,), - ) - - @require_torch_sdpa - def test_sdpa_can_dispatch_composite_models(self): - super().test_sdpa_can_dispatch_composite_models() - class AIMv2TextModelTester: def __init__( @@ -346,7 +304,7 @@ class AIMv2TextModelTester: @require_torch class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): all_model_classes = (AIMv2TextModel,) if is_torch_available() else () - fx_compatible = True + fx_compatible = False test_pruning = False test_head_masking = False @@ -369,54 +327,17 @@ class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @unittest.skip(reason="This model has no Loss") def test_training_gradient_checkpointing_use_reentrant(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @unittest.skip(reason="This model has no Loss") def test_training_gradient_checkpointing_use_reentrant_false(self): pass - # @unittest.skip(reason="AIMv2 does not use inputs_embeds") - # def test_inputs_embeds(self): - # pass - - # @unittest.skip(reason="AIMv2TextModel has no base class and is not available in MODEL_MAPPING") - # def test_save_load_fast_init_from_base(self): - # pass - - # @unittest.skip(reason="AIMv2TextModel has no 
base class and is not available in MODEL_MAPPING") - # def test_save_load_fast_init_to_base(self): - # pass - - # @slow - # def test_model_from_pretrained(self): - # model_name = "openai/AIMv2-vit-base-patch32" - # model = AIMv2TextModel.from_pretrained(model_name) - # self.assertIsNotNone(model) - - # @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - # @require_torch_sdpa - # @slow - # @is_flaky() - # def test_eager_matches_sdpa_inference(self, torch_dtype: str): - # super().test_eager_matches_sdpa_inference( - # torch_dtype=torch_dtype, - # logit_keys=("last_hidden_state", "pooler_output", "text_embeds"), - # use_attention_mask_options=(None, "right"), # "left" is not supported for text model - # ) - - # @require_torch_sdpa - # def test_sdpa_can_dispatch_composite_models(self): - # super().test_sdpa_can_dispatch_composite_models() - - # @require_torch_sdpa - # def test_sdpa_can_dispatch_on_flash(self): - # self.skipTest(reason="AIMv2TextModel has two attention masks: `causal_attention_mask` and `attention_mask`") + @unittest.skip(reason="AIMv2 does not use inputs_embeds") + def test_inputs_embeds(self): + pass class AIMv2ModelTester: @@ -459,6 +380,10 @@ class AIMv2ModelTester: def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, input_ids, attention_mask, pixel_values = config_and_inputs + + # Set use_head to True for LIT variant + # config.vision_config.use_head = True + inputs_dict = { "input_ids": input_ids, "attention_mask": attention_mask, @@ -470,13 +395,14 @@ class AIMv2ModelTester: @require_torch class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + additional_model_inputs = ["pixel_values"] all_model_classes = (AIMv2Model,) if is_torch_available() else () pipeline_model_mapping = ( {"feature-extraction": AIMv2Model, "image-feature-extraction": AIMv2VisionModel} if is_torch_available() else {} ) - fx_compatible = True + fx_compatible = False test_head_masking = False test_pruning = False test_resize_embeddings = False @@ -492,6 +418,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() + print(config_and_inputs) self.model_tester.create_and_check_model(*config_and_inputs) def test_config(self): @@ -513,101 +440,6 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for AIMv2 - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def _create_and_check_torchscript(self, config, inputs_dict): - if not self.test_torchscript: - self.skipTest(reason="test_torchscript is set to False") - - configs_no_init = 
_config_zero_init(config) # To be sure we have no Nan - configs_no_init.torchscript = True - configs_no_init.return_dict = False - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.to(torch_device) - model.eval() - - try: - input_ids = inputs_dict["input_ids"] - pixel_values = inputs_dict["pixel_values"] # AIMv2 needs pixel_values - traced_model = torch.jit.trace(model, (input_ids, pixel_values)) - except RuntimeError: - self.fail("Couldn't trace module.") - - with tempfile.TemporaryDirectory() as tmp_dir_name: - pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") - - try: - torch.jit.save(traced_model, pt_file_name) - except Exception: - self.fail("Couldn't save module.") - - try: - loaded_model = torch.jit.load(pt_file_name) - except Exception: - self.fail("Couldn't load module.") - - model.to(torch_device) - model.eval() - - loaded_model.to(torch_device) - loaded_model.eval() - - model_state_dict = model.state_dict() - loaded_model_state_dict = loaded_model.state_dict() - - non_persistent_buffers = {} - for key in loaded_model_state_dict.keys(): - if key not in model_state_dict.keys(): - non_persistent_buffers[key] = loaded_model_state_dict[key] - - loaded_model_state_dict = { - key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers - } - - self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) - - model_buffers = list(model.buffers()) - for non_persistent_buffer in non_persistent_buffers.values(): - found_buffer = False - for i, model_buffer in enumerate(model_buffers): - if torch.equal(non_persistent_buffer, model_buffer): - found_buffer = True - break - - self.assertTrue(found_buffer) - model_buffers.pop(i) - - models_equal = True - for layer_name, p1 in model_state_dict.items(): - p2 = loaded_model_state_dict[layer_name] - if p1.data.ne(p2.data).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -623,21 +455,6 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa text_config = AIMv2TextConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - super().test_eager_matches_sdpa_inference( - torch_dtype=torch_dtype, - logit_keys=("logits_per_image", "logits_per_text"), - use_attention_mask_options=(None, "right"), # "left" is not supported for text model - ) - - @require_torch_sdpa - def test_sdpa_can_dispatch_composite_models(self): - super().test_sdpa_can_dispatch_composite_models() - @require_flash_attn @require_torch_gpu @mark.flash_attn_test From b893bc87622bbf820358033a9f4dbf378be60a74 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 07:43:04 +0530 Subject: [PATCH 20/62] Refactor --- .../convert_aimv2_original_pytorch_to_hf.py | 7 +- .../models/aimv2/modeling_aimv2.py | 92 +++++++++++-------- .../models/aimv2/modular_aimv2.py | 87 +++++++----------- 3 files changed, 93 insertions(+), 93 deletions(-) diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 18e1d635593..6efd2d023ef 100644 --- 
a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -61,7 +61,10 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { r"image_encoder.trunk.post_trunk_norm": r"vision_model.rms_norm", r"image_projector": r"visual_projection", # Vision Head - r"image_encoder.head": r"vision_model.head", + r"image_encoder.head.cls_token": r"vision_model.head.cls_token", + r"image_encoder.head.k": r"vision_model.head.k_proj", + r"image_encoder.head.v": r"vision_model.head.v_proj", + r"image_encoder.head.linear": r"vision_model.head.output_proj", # Text Embeddings r"text_encoder.preprocessor.text_embedding.weight": r"text_model.embeddings.token_embedding.weight", r"text_encoder.preprocessor.positional_embedding": r"text_model.embeddings.position_embedding.weight", @@ -166,7 +169,7 @@ def write_model( state_dict = {} # For `apple/aimv2-large-patch14-native` we don't have position_embedding in state_dict - strict_loading = False + strict_loading = True result = convert_old_keys_to_new_keys(original_state_dict, key_mapping) all_keys = list(original_state_dict.keys()) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index e4f0e42c926..82563ae3c3c 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -29,10 +29,11 @@ import torch.nn.functional as F from torch import nn from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_outputs import BaseModelOutputWithPooling from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput from ...utils import ( ModelOutput, add_start_docstrings, @@ -50,7 +51,6 @@ logger = logging.get_logger(__name__) class AIMv2Output(ModelOutput): """ Args: - logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. @@ -207,23 +207,37 @@ class AIMv2TextEmbeddings(nn.Module): return embeddings +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + def eager_attention_forward( module: nn.Module, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, attention_mask: Optional[torch.Tensor], + scaling: float, dropout: float = 0.0, **kwargs, ): - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: - attn_weights = attn_weights + attention_mask + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - - # Only apply attention dropout during training. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() @@ -247,6 +261,9 @@ class AIMv2Attention(nn.Module): f" {self.num_heads})." ) + self.num_key_value_groups = 1 + self.scaling = 1.0 + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) @@ -289,6 +306,7 @@ class AIMv2Attention(nn.Module): value_states, attention_mask, dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, is_causal=False, **kwargs, ) @@ -421,30 +439,35 @@ class AIMv2Encoder(nn.Module): class AIMv2AttentionPoolingHead(nn.Module): def __init__(self, config: AIMv2VisionConfig): super().__init__() - dim = config.hidden_size - qkv_bias = config.qkv_bias - + self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.k = nn.Linear(dim, dim, bias=qkv_bias) - self.v = nn.Linear(dim, dim, bias=qkv_bias) - self.cls_token = nn.Parameter(torch.randn(1, 1, dim) * 0.02) - self.linear = nn.Linear(dim, dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) + self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) - def forward(self, x: torch.Tensor) -> torch.Tensor: - B, N, C = x.shape - cls_token = self.cls_token.expand(B, -1, -1) + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size)) + self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True) - q = cls_token.reshape(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) - k = self.k(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) - v = self.v(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape - x_cls = 
F.scaled_dot_product_attention(q, k, v) - x_cls = x_cls.transpose(1, 2).reshape(B, 1, C) - x_cls = x_cls.mean(dim=1) + cls_token = self.cls_token.expand(batch_size, -1, -1) - out = self.linear(x_cls) - return out + key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads) + + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + + attn_output = F.scaled_dot_product_attention(query, key, value) + + attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim) + attn_output = attn_output.mean(dim=1) + + output = self.output_proj(attn_output) + return output class AIMv2PreTrainedModel(PreTrainedModel): @@ -471,8 +494,6 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() class AIMv2VisionModel(AIMv2PreTrainedModel): @@ -756,10 +777,7 @@ class AIMv2Model(AIMv2PreTrainedModel): self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) - # Verify whether it's working right or not. - logit_scale_tensor = torch.tensor(self.config.logit_scale_init_value) - self.log_logit_scale = nn.Parameter(torch.log(logit_scale_tensor)) - + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) self.max_log_logit_scale = math.log(config.max_logit_scale) # Initialize weights and apply final processing @@ -887,9 +905,9 @@ class AIMv2Model(AIMv2PreTrainedModel): ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, CLIPModel + >>> from transformers import AutoProcessor, AIMv2Model - >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> model = AIMv2Model.from_pretrained("openai/clip-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -935,7 +953,7 @@ class AIMv2Model(AIMv2PreTrainedModel): image_embeds = image_embeds / _get_vector_norm(image_embeds) text_embeds = text_embeds / _get_vector_norm(text_embeds) - logit_scale = self.log_logit_scale.clamp(0.0, self.max_log_logit_scale).exp() + logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp() logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index c5dcd01bb40..6baadf14a36 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -33,7 +33,7 @@ from ...utils import ( logging, ) from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm -from ..llama.modeling_llama import LlamaRMSNorm +from ..llama.modeling_llama import LlamaRMSNorm, eager_attention_forward from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig from ..siglip.modeling_siglip import SiglipEncoder @@ -151,7 +151,6 @@ class AIMv2Config(SiglipConfig): class AIMv2Output(ModelOutput): """ 
Args: - logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. @@ -256,30 +255,6 @@ class AIMv2TextEmbeddings(CLIPTextEmbeddings): pass -def eager_attention_forward( - module: nn.Module, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: Optional[torch.Tensor], - dropout: float = 0.0, - **kwargs, -): - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - - # Only apply attention dropout during training. - attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - - return attn_output, attn_weights - - class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -296,6 +271,9 @@ class AIMv2Attention(nn.Module): f" {self.num_heads})." ) + self.num_key_value_groups = 1 + self.scaling = 1.0 + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) @@ -338,6 +316,7 @@ class AIMv2Attention(nn.Module): value_states, attention_mask, dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, is_causal=False, **kwargs, ) @@ -386,30 +365,35 @@ class AIMv2Encoder(SiglipEncoder): class AIMv2AttentionPoolingHead(nn.Module): def __init__(self, config: AIMv2VisionConfig): super().__init__() - dim = config.hidden_size - qkv_bias = config.qkv_bias - + self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.k = nn.Linear(dim, dim, bias=qkv_bias) - self.v = nn.Linear(dim, dim, bias=qkv_bias) - self.cls_token = nn.Parameter(torch.randn(1, 1, dim) * 0.02) - self.linear = nn.Linear(dim, dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) + self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) - def forward(self, x: torch.Tensor) -> torch.Tensor: - B, N, C = x.shape - cls_token = self.cls_token.expand(B, -1, -1) + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size)) + self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True) - q = cls_token.reshape(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) - k = self.k(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) - v = self.v(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape - x_cls = F.scaled_dot_product_attention(q, k, v) - x_cls = x_cls.transpose(1, 2).reshape(B, 1, C) - x_cls = x_cls.mean(dim=1) + cls_token = self.cls_token.expand(batch_size, -1, -1) - out = self.linear(x_cls) - return out + key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + query = cls_token.reshape(batch_size, 1, 
self.num_heads, hidden_dim // self.num_heads) + + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + + attn_output = F.scaled_dot_product_attention(query, key, value) + + attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim) + attn_output = attn_output.mean(dim=1) + + output = self.output_proj(attn_output) + return output class AIMv2PreTrainedModel(PreTrainedModel): @@ -436,8 +420,6 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() class AIMv2VisionModel(AIMv2PreTrainedModel): @@ -602,11 +584,8 @@ class AIMv2Model(CLIPModel, nn.Module): self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) - # Verify whether it's working right or not. - logit_scale_tensor = torch.tensor(self.config.logit_scale_init_value) - self.log_logit_scale = nn.Parameter(torch.log(logit_scale_tensor)) - - self.max_log_logit_scale = math.log(config.max_logit_scale) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + self.max_logit_scale = math.log(config.max_logit_scale) # Initialize weights and apply final processing self.post_init() @@ -629,9 +608,9 @@ class AIMv2Model(CLIPModel, nn.Module): ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, CLIPModel + >>> from transformers import AutoProcessor, AIMv2Model - >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> model = AIMv2Model.from_pretrained("openai/clip-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -677,7 +656,7 @@ class AIMv2Model(CLIPModel, nn.Module): image_embeds = image_embeds / _get_vector_norm(image_embeds) text_embeds = text_embeds / _get_vector_norm(text_embeds) - logit_scale = self.log_logit_scale.clamp(0.0, self.max_log_logit_scale).exp() + logit_scale = self.logit_scale.clamp(0.0, self.max_logit_scale).exp() logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() From da7bb61274be122c719db64eeddc13eb5b864d07 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 07:43:11 +0530 Subject: [PATCH 21/62] Updated testcase --- tests/models/aimv2/test_modeling_aimv2.py | 25 +++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 1c65c4ca195..f8fe391badf 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -37,6 +37,7 @@ from transformers.utils import ( from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( ModelTesterMixin, + _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, @@ -440,6 +441,30 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_model_get_set_embeddings(self): pass + # override as the `logit_scale` parameter initialization is different for CLIP + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in 
self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initialized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() From c91ff5adb445b4b822ff475449626aa9a93fbc1c Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 08:32:23 +0530 Subject: [PATCH 22/62] Updated config --- .../models/aimv2/configuration_aimv2.py | 98 ++++++------- .../models/aimv2/modeling_aimv2.py | 4 +- .../models/aimv2/modular_aimv2.py | 135 ++++++++++++++++++ 3 files changed, 182 insertions(+), 55 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 345e1cba466..df9dd4e5d21 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -37,42 +37,38 @@ class AIMv2VisionConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, *optional*, defaults to 768): + hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): + intermediate_size (`int`, *optional*, defaults to 2816): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - num_hidden_layers (`int`, *optional*, defaults to 12): + num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer encoder. num_channels (`int`, *optional*, defaults to 3): Number of channels in the input images. image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): + patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
- - Example: - - ```python - >>> from transformers import AIMv2VisionConfig, AIMv2VisionModel - - >>> # Initializing a AIMv2VisionConfig with google/aimv2-base-patch16-224 style configuration - >>> configuration = AIMv2VisionConfig() - - >>> # Initializing a AIMv2VisionModel (with random weights) from the google/aimv2-base-patch16-224 style configuration - >>> model = AIMv2VisionModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" + projection_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for projection layer in Attention Module. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + use_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the Linear layers or Not. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the for initializing all weight matrices. + use_head (`str`, *optional*, defaults to `True`): + Whether to use Attention Pooling Head or Not. + """ model_type = "aimv2_vision_model" base_config_key = "vision_config" @@ -127,50 +123,42 @@ class AIMv2TextConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - vocab_size (`int`, *optional*, defaults to 32000): + vocab_size (`int`, *optional*, defaults to 49408): Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`AIMv2Model`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 3072): + intermediate_size (`int`, *optional*, defaults to 2048): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. num_hidden_layers (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 6): Number of attention heads for each attention layer in the Transformer encoder. - max_position_embeddings (`int`, *optional*, defaults to 64): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the layer normalization layers. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. + projection_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for projection layer in Attention Module. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + use_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the Linear layers or Not. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. pad_token_id (`int`, *optional*, defaults to 1): The id of the padding token in the vocabulary. bos_token_id (`int`, *optional*, defaults to 49406): The id of the beginning-of-sequence token in the vocabulary. eos_token_id (`int`, *optional*, defaults to 49407): The id of the end-of-sequence token in the vocabulary. - projection_size (`int`, *optional*, defaults to `hidden_size`): - The size of the projection head. - - Example: - - ```python - >>> from transformers import AIMv2TextConfig, AIMv2TextModel - - >>> # Initializing a AIMv2TextConfig with google/aimv2-base-patch16-224 style configuration - >>> configuration = AIMv2TextConfig() - - >>> # Initializing a AIMv2TextModel (with random weights) from the google/aimv2-base-patch16-224 style configuration - >>> model = AIMv2TextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the for initializing all weight matrices. + """ model_type = "aimv2_text_model" base_config_key = "text_config" @@ -228,6 +216,10 @@ class AIMv2Config(PretrainedConfig): Dictionary of configuration options used to initialize [`AIMv2TextConfig`]. vision_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`AIMv2VisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. kwargs (*optional*): Dictionary of keyword arguments. 
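
The hunk above settles the documented defaults of the two sub-configurations and adds `projection_dim` and `logit_scale_init_value` to `AIMv2Config`. As a rough illustration of how the pieces are meant to fit together — a sketch, not part of the patch; it assumes the three configuration classes are exported from `transformers` as the earlier `__init__.py` changes suggest, and that `AIMv2Config` accepts dict-valued sub-configs like its SigLIP parent — note that the default `logit_scale_init_value` of 2.6592 is simply `log(1 / 0.07)`, the value the updated `test_initialization` asserts:

```python
# Illustrative sketch only (not part of the patch). Assumes AIMv2Config,
# AIMv2TextConfig and AIMv2VisionConfig are importable from transformers and
# that AIMv2Config takes dict-valued sub-configs, as SiglipConfig does.
import math

from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig

vision_config = AIMv2VisionConfig()  # defaults per the docstring: hidden_size=1024, patch_size=14, use_head=True
text_config = AIMv2TextConfig()      # defaults per the docstring: vocab_size=49408, hidden_size=768

config = AIMv2Config(
    vision_config=vision_config.to_dict(),
    text_config=text_config.to_dict(),
    projection_dim=512,
    logit_scale_init_value=2.6592,   # ~= math.log(1 / 0.07), the value checked in test_initialization
)

assert math.isclose(config.logit_scale_init_value, math.log(1 / 0.07), rel_tol=1e-3)
print(config.vision_config.hidden_size, config.text_config.hidden_size, config.projection_dim)
```
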
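`use_head`, documented just above, toggles the attention-pooling head that the earlier refactor split into `cls_token`, `k_proj`, `v_proj` and `output_proj` — the same names the conversion script now maps the original checkpoint keys onto. The standalone sketch below restates that pooling step outside the model so the shape flow is easy to follow; the class name, the example batch and the 224/14 patch grid are illustrative assumptions, not code taken from the patch:

```python
# Standalone restatement of the refactored attention-pooling head (sketch only).
import torch
import torch.nn.functional as F
from torch import nn


class AttentionPoolingHeadSketch(nn.Module):
    def __init__(self, hidden_size: int = 1024, num_heads: int = 8, qkv_bias: bool = False):
        super().__init__()
        self.num_heads = num_heads
        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias)
        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
        self.output_proj = nn.Linear(hidden_size, hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape
        head_dim = hidden_dim // self.num_heads

        # A single learnable cls token acts as the query over all patch tokens.
        query = self.cls_token.expand(batch_size, -1, -1)
        query = query.reshape(batch_size, 1, self.num_heads, head_dim).permute(0, 2, 1, 3)
        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, head_dim).permute(0, 2, 1, 3)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, head_dim).permute(0, 2, 1, 3)

        # One round of cross-attention pools the whole sequence into one vector per image.
        pooled = F.scaled_dot_product_attention(query, key, value)
        pooled = pooled.transpose(1, 2).reshape(batch_size, 1, hidden_dim).mean(dim=1)
        return self.output_proj(pooled)


head = AttentionPoolingHeadSketch()
patches = torch.randn(2, 256, 1024)  # (batch, num_patches, hidden_size); 256 = (224 // 14) ** 2
print(head(patches).shape)  # torch.Size([2, 1024]) -> becomes pooler_output when use_head=True
```

When `use_head` is enabled, this pooled vector is what `AIMv2VisionModel` returns as `pooler_output` and what `AIMv2Model` feeds into `visual_projection` for the contrastive logits.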
diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 82563ae3c3c..ee9228c972e 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -778,7 +778,7 @@ class AIMv2Model(AIMv2PreTrainedModel): self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) - self.max_log_logit_scale = math.log(config.max_logit_scale) + self.max_logit_scale = math.log(config.max_logit_scale) # Initialize weights and apply final processing self.post_init() @@ -953,7 +953,7 @@ class AIMv2Model(AIMv2PreTrainedModel): image_embeds = image_embeds / _get_vector_norm(image_embeds) text_embeds = text_embeds / _get_vector_norm(text_embeds) - logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp() + logit_scale = self.logit_scale.clamp(0.0, self.max_logit_scale).exp() logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 6baadf14a36..35fc2e00018 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -42,6 +42,49 @@ logger = logging.get_logger(__name__) class AIMv2VisionConfig(SiglipVisionConfig): + r""" + This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a + AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 + [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2816): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + projection_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for projection layer in Attention Module. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + use_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the Linear layers or Not. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the for initializing all weight matrices. + use_head (`str`, *optional*, defaults to `True`): + Whether to use Attention Pooling Head or Not. + """ + def __init__( self, hidden_size: int = 1024, @@ -86,6 +129,53 @@ class AIMv2VisionConfig(SiglipVisionConfig): class AIMv2TextConfig(SiglipTextConfig): + r""" + This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a + AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 + [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`AIMv2Model`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 6): + Number of attention heads for each attention layer in the Transformer encoder. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + projection_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for projection layer in Attention Module. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + use_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the Linear layers or Not. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + pad_token_id (`int`, *optional*, defaults to 1): + The id of the padding token in the vocabulary. + bos_token_id (`int`, *optional*, defaults to 49406): + The id of the beginning-of-sequence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 49407): + The id of the end-of-sequence token in the vocabulary. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the for initializing all weight matrices. 
+ """ + def __init__( self, vocab_size: int = 49408, @@ -134,6 +224,51 @@ class AIMv2TextConfig(SiglipTextConfig): class AIMv2Config(SiglipConfig): + r""" + [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to + instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2 + [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AIMv2TextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AIMv2VisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import AIMv2Config, AIMv2Model + + >>> # Initializing a AIMv2Config with google/aimv2-base-patch16-224 style configuration + >>> configuration = AIMv2Config() + + >>> # Initializing a AIMv2Model (with random weights) from the google/aimv2-base-patch16-224 style configuration + >>> model = AIMv2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a AIMv2Config from a AIMv2TextConfig and a AIMv2VisionConfig + >>> from transformers import AIMv2TextConfig, AIMv2VisionConfig + + >>> # Initializing a AIMv2Text and AIMv2Vision configuration + >>> config_text = AIMv2TextConfig() + >>> config_vision = AIMv2VisionConfig() + + >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) + ```""" + def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs ): From 104296a3dcdc61532b8c92a6b48e5f85333cfb99 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 08:32:32 +0530 Subject: [PATCH 23/62] make fixup --- src/transformers/models/auto/modeling_auto.py | 1 - src/transformers/utils/dummy_pt_objects.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index e91d94ac9fa..918702e5d21 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1418,7 +1418,6 @@ MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = OrderedDict( MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Zero Shot Image Classification mapping - ("aimv2", "AIMv2Model"), ("align", "AlignModel"), ("altclip", "AltCLIPModel"), ("blip", "BlipModel"), diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index ffcf295e417..30d38784ba3 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -570,6 +570,13 @@ class AIMv2Model(metaclass=DummyObject): requires_backends(self, ["torch"]) +class AIMv2TextModel(metaclass=DummyObject): + _backends = ["torch"] + + def 
__init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AIMv2VisionModel(metaclass=DummyObject): _backends = ["torch"] From 019210c9c7b6a036742a8dfe7085fe3d37c8d8f0 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 09:08:05 +0530 Subject: [PATCH 24/62] more fixes --- docs/source/en/model_doc/aimv2.md | 5 ----- src/transformers/__init__.py | 3 --- .../models/aimv2/convert_aimv2_original_pytorch_to_hf.py | 4 +++- src/transformers/models/aimv2/modular_aimv2.py | 7 ++----- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md index 917e6d8d816..ee9abfe194f 100644 --- a/docs/source/en/model_doc/aimv2.md +++ b/docs/source/en/model_doc/aimv2.md @@ -50,10 +50,5 @@ The original code can be found [here](). - forward -## AIMv2ForImageClassification - -[[autodoc]] AIMv2ForImageClassification - - forward - diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7979b236211..6972bcf8295 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6752,15 +6752,12 @@ if TYPE_CHECKING: model_addition_debugger_context, ) from .modeling_rope_utils import ROPE_INIT_FUNCTIONS - from .modeling_utils import AttentionInterface, PreTrainedModel - from .models.aimv2 import ( AIMv2Model, AIMv2TextModel, AIMv2VisionModel, ) - from .models.albert import ( AlbertForMaskedLM, AlbertForMultipleChoice, diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 6efd2d023ef..b8120375c3d 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -79,6 +79,8 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2", r"text_encoder.trunk.post_trunk_norm": r"text_model.rms_norm", r"text_projector": r"text_projection", + r"log_logit_scale": r"logit_scale", + } @@ -169,7 +171,7 @@ def write_model( state_dict = {} # For `apple/aimv2-large-patch14-native` we don't have position_embedding in state_dict - strict_loading = True + strict_loading = False result = convert_old_keys_to_new_keys(original_state_dict, key_mapping) all_keys = list(original_state_dict.keys()) diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 35fc2e00018..d2e94c0c7f0 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -83,8 +83,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. - """ - + """ def __init__( self, hidden_size: int = 1024, @@ -174,8 +173,7 @@ class AIMv2TextConfig(SiglipTextConfig): just in case (e.g., 512 or 1024 or 2048). initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the for initializing all weight matrices. 
- """ - + """ def __init__( self, vocab_size: int = 49408, @@ -268,7 +266,6 @@ class AIMv2Config(SiglipConfig): >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) ```""" - def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs ): From a031299d13685b881a5c47784314e59110ae04e1 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 11:00:57 +0530 Subject: [PATCH 25/62] Bug fix and updates --- .../models/aimv2/configuration_aimv2.py | 18 ++++++++----- .../models/aimv2/modeling_aimv2.py | 8 +++--- .../models/aimv2/modular_aimv2.py | 26 ++++++++++++------- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index df9dd4e5d21..9d370a5853e 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -87,9 +87,10 @@ class AIMv2VisionConfig(PretrainedConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, - hidden_act="silu", - initializer_range=0.02, - use_head=True, + hidden_act: str = "silu", + initializer_range: float = 0.02, + use_head: bool = True, + is_causal: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -110,6 +111,7 @@ class AIMv2VisionConfig(PretrainedConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout + self.is_causal = is_causal class AIMv2TextConfig(PretrainedConfig): @@ -175,12 +177,13 @@ class AIMv2TextConfig(PretrainedConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, - hidden_act="silu", - pad_token_id=None, - bos_token_id=None, + hidden_act: str = "silu", + pad_token_id: int = None, + bos_token_id: int = None, eos_token_id: int = 49407, max_position_embeddings: int = 77, - initializer_range=0.02, + initializer_range: bool = 0.02, + is_causal: bool = True, **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -199,6 +202,7 @@ class AIMv2TextConfig(PretrainedConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout + self.is_causal = is_causal class AIMv2Config(PretrainedConfig): diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index ee9228c972e..d46de33ea56 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -248,7 +248,7 @@ def eager_attention_forward( class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AIMv2VisionConfig): + def __init__(self, config): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -262,7 +262,7 @@ class AIMv2Attention(nn.Module): ) self.num_key_value_groups = 1 - self.scaling = 1.0 + self.scaling = self.head_dim**-0.5 self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) @@ -270,6 +270,8 @@ class AIMv2Attention(nn.Module): self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.proj_drop = nn.Dropout(config.projection_dropout) + self.is_causal = config.is_causal + def forward( self, hidden_states: torch.Tensor, @@ -307,7 +309,7 @@ class 
AIMv2Attention(nn.Module): attention_mask, dropout=0.0 if not self.training else self.attention_dropout, scaling=self.scaling, - is_causal=False, + is_causal=self.is_causal, **kwargs, ) diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index d2e94c0c7f0..03f3770100e 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -98,9 +98,10 @@ class AIMv2VisionConfig(SiglipVisionConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, - hidden_act="silu", - initializer_range=0.02, - use_head=True, + hidden_act: str ="silu", + initializer_range: float =0.02, + use_head: bool =True, + is_causal: bool = False, **kwargs, ): super().__init__( @@ -123,6 +124,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout + self.is_causal=is_causal del self.layer_norm_eps @@ -186,12 +188,13 @@ class AIMv2TextConfig(SiglipTextConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, - hidden_act="silu", - pad_token_id=None, - bos_token_id=None, + hidden_act: str="silu", + pad_token_id: int=None, + bos_token_id: int=None, eos_token_id: int = 49407, max_position_embeddings: int = 77, - initializer_range=0.02, + initializer_range: bool=0.02, + is_causal: bool = True, **kwargs, ): super().__init__( @@ -214,6 +217,7 @@ class AIMv2TextConfig(SiglipTextConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout + self.is_causal=is_causal del self.bos_token_id del self.pad_token_id @@ -390,7 +394,7 @@ class AIMv2TextEmbeddings(CLIPTextEmbeddings): class AIMv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: AIMv2VisionConfig): + def __init__(self, config): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -404,7 +408,7 @@ class AIMv2Attention(nn.Module): ) self.num_key_value_groups = 1 - self.scaling = 1.0 + self.scaling = self.head_dim ** -0.5 self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) @@ -412,6 +416,8 @@ class AIMv2Attention(nn.Module): self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.proj_drop = nn.Dropout(config.projection_dropout) + self.is_causal = config.is_causal + def forward( self, hidden_states: torch.Tensor, @@ -449,7 +455,7 @@ class AIMv2Attention(nn.Module): attention_mask, dropout=0.0 if not self.training else self.attention_dropout, scaling=self.scaling, - is_causal=False, + is_causal=self.is_causal, **kwargs, ) From 923f76f2dfaeca89ab3871c5dc58bdd949498270 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 12:54:59 +0530 Subject: [PATCH 26/62] deadcode --- src/transformers/models/auto/feature_extraction_auto.py | 1 - src/transformers/models/auto/modeling_auto.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 250838452bf..0b8b38bc347 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -39,7 +39,6 @@ logger = logging.get_logger(__name__) FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( [ - ("aimv2", 
"AIMv2FeatureExtractor"), ("audio-spectrogram-transformer", "ASTFeatureExtractor"), ("beit", "BeitFeatureExtractor"), ("chinese_clip", "ChineseCLIPFeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 918702e5d21..dbb9f17b79e 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -684,7 +684,6 @@ MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Image Classification mapping - ("aimv2", "AIMv2ForImageClassification"), ("beit", "BeitForImageClassification"), ("bit", "BitForImageClassification"), ("clip", "CLIPForImageClassification"), From 3c2d124a77bed8f914d5d2bf610a741dae7944b4 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 12:55:59 +0530 Subject: [PATCH 27/62] Fixes --- .../convert_aimv2_original_pytorch_to_hf.py | 5 +- .../models/aimv2/modeling_aimv2.py | 6 +- .../models/aimv2/modular_aimv2.py | 33 +++--- tests/models/aimv2/test_modeling_aimv2.py | 108 +++++++++++++++++- utils/check_repo.py | 1 + 5 files changed, 134 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index b8120375c3d..ce9e2540b13 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -80,7 +80,6 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { r"text_encoder.trunk.post_trunk_norm": r"text_model.rms_norm", r"text_projector": r"text_projection", r"log_logit_scale": r"logit_scale", - } @@ -162,8 +161,8 @@ def write_model( config = config_class.from_pretrained(hf_repo_id) # Checkpoint `apple/aimv2-large-patch14-224-lit` uses AttentionPoolingHead hence set the required attr in config. - if hf_repo_id == "apple/aimv2-large-patch14-224-lit": - config.vision_config.use_head = True + if hf_repo_id != "apple/aimv2-large-patch14-224-lit": + config.use_head = False original_state_dict = load_original_state_dict(hf_repo_id) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index d46de33ea56..76c77c28ffa 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -158,7 +158,11 @@ class AIMv2VisionEmbeddings(nn.Module): if self.config.image_size != height or self.config.image_size != width: pos_embed = self.build_2d_sincos_position_embedding( - height // self.patch_size, width // self.patch_size, embed_dim=self.config.hidden_size + height // self.patch_size, + width // self.patch_size, + embed_dim=self.config.hidden_size, + device=hidden_states.device, + dtype=hidden_states.dtype, ) else: pos_embed = self.position_embedding(self.position_ids) diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 03f3770100e..9086f0aaa5f 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -83,7 +83,8 @@ class AIMv2VisionConfig(SiglipVisionConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. 
- """ + """ + def __init__( self, hidden_size: int = 1024, @@ -98,9 +99,9 @@ class AIMv2VisionConfig(SiglipVisionConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, - hidden_act: str ="silu", - initializer_range: float =0.02, - use_head: bool =True, + hidden_act: str = "silu", + initializer_range: float = 0.02, + use_head: bool = True, is_causal: bool = False, **kwargs, ): @@ -124,7 +125,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.is_causal=is_causal + self.is_causal = is_causal del self.layer_norm_eps @@ -175,7 +176,8 @@ class AIMv2TextConfig(SiglipTextConfig): just in case (e.g., 512 or 1024 or 2048). initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the for initializing all weight matrices. - """ + """ + def __init__( self, vocab_size: int = 49408, @@ -188,12 +190,12 @@ class AIMv2TextConfig(SiglipTextConfig): projection_dropout: float = 0.0, qkv_bias: bool = False, use_bias: bool = False, - hidden_act: str="silu", - pad_token_id: int=None, - bos_token_id: int=None, + hidden_act: str = "silu", + pad_token_id: int = None, + bos_token_id: int = None, eos_token_id: int = 49407, max_position_embeddings: int = 77, - initializer_range: bool=0.02, + initializer_range: bool = 0.02, is_causal: bool = True, **kwargs, ): @@ -217,7 +219,7 @@ class AIMv2TextConfig(SiglipTextConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.is_causal=is_causal + self.is_causal = is_causal del self.bos_token_id del self.pad_token_id @@ -270,6 +272,7 @@ class AIMv2Config(SiglipConfig): >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) ```""" + def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs ): @@ -378,7 +381,11 @@ class AIMv2VisionEmbeddings(nn.Module): if self.config.image_size != height or self.config.image_size != width: pos_embed = self.build_2d_sincos_position_embedding( - height // self.patch_size, width // self.patch_size, embed_dim=self.config.hidden_size + height // self.patch_size, + width // self.patch_size, + embed_dim=self.config.hidden_size, + device=hidden_states.device, + dtype=hidden_states.dtype, ) else: pos_embed = self.position_embedding(self.position_ids) @@ -408,7 +415,7 @@ class AIMv2Attention(nn.Module): ) self.num_key_value_groups = 1 - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index f8fe391badf..4d1639faf0e 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -19,6 +19,7 @@ import tempfile import unittest import numpy as np +import requests from pytest import mark from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig @@ -26,6 +27,7 @@ from transformers.testing_utils import ( require_flash_attn, require_torch, require_torch_gpu, + require_vision, slow, torch_device, ) @@ -57,7 +59,9 @@ if is_torch_available(): if is_vision_available(): - pass + from PIL import Image + + from transformers import AutoImageProcessor, AutoProcessor class AIMv2VisionModelTester: @@ -441,7 +445,7 @@ class 
AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_model_get_set_embeddings(self): pass - # override as the `logit_scale` parameter initialization is different for CLIP + # Override as the `logit_scale` parameter initialization is different for AIMv2 def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -569,3 +573,103 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa torch.allclose(logits_per_text_eager, logits_per_text_sdpa, atol=4e-2, rtol=4e-2), f"Text logits max diff: {torch.max(torch.abs(logits_per_text_eager - logits_per_text_sdpa))}", ) + + +@require_vision +@require_torch +class AIMv2ModelIntegrationTest(unittest.TestCase): + @slow + def test_inference(self): + model_name = "yaswanthgali/aimv2-large-patch14-224-lit-HF" + model = AIMv2Model.from_pretrained(model_name, device_map="auto") + processor = AutoProcessor.from_pretrained(model_name) + + image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + inputs = processor( + text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt" + ).to(model.device) + + # Forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # Verify the logits + self.assertEqual( + outputs.logits_per_image.shape, + torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])), + ) + self.assertEqual( + outputs.logits_per_text.shape, + torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])), + ) + + # handle device + expected_logits = torch.tensor([[34.2415, 24.6724]]).to(model.device) + self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) + + +@require_vision +class AIMv2VisionModelIntegrationTests(unittest.TestCase): + @slow + def test_inference(self): + model_name = "yaswanthgali/aimv2-large-patch14-224-HF" + + model = AIMv2VisionModel.from_pretrained(model_name, device_map="auto") + processor = AutoImageProcessor.from_pretrained(model_name) + + image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + inputs = processor(image, return_tensors="pt").to(model.device) + + with torch.no_grad(): + output = model(**inputs) + + # Verify logits shape + self.assertEqual(output.last_hidden_state.shape, torch.Size([1, 256, 1024])) + + # Verify logits slice + # fmt: off + expected_logits = torch.tensor( + [[ 0.0510, 0.0806, -0.0990, -0.0154], + [ 2.7850, -2.5143, -0.3320, 2.4196], + [ 2.8179, -2.4089, -0.2770, 2.3218], + [ 2.7641, -2.4114, -0.3684, 2.2998], + [ 2.7972, -2.3180, -0.4490, 2.2302], + [ 2.8584, -2.5322, -0.2302, 2.4936], + [-2.7849, 2.4121, 1.3670, -1.5514]]).to(model.device) + # fmt: on + + output_slice = output.last_hidden_state.squeeze(0)[0:7, 0:4] + self.assertTrue(torch.allclose(output_slice, expected_logits, atol=1e-3)) + + @slow + def test_inference_for_native_resolution(self): + model_name = "yaswanthgali/aimv2-large-patch14-native-HF" + + model = AIMv2VisionModel.from_pretrained(model_name, device_map="auto") + processor = AutoImageProcessor.from_pretrained(model_name) + + image = image = Image.open( + requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw + ) + inputs = processor(image, return_tensors="pt").to(model.device) + + with torch.no_grad(): + output = model(**inputs) + + # Verify logits shape + self.assertEqual(output.last_hidden_state.shape, torch.Size([1, 1530, 1024])) + + # 
Verify logits slice + # fmt: off + expected_logits = torch.tensor( + [[-1.3342, 0.3720, 0.0963, 0.4159], + [-1.5328, 0.4677, 0.0936, 0.4321], + [-0.3775, -0.2758, -0.0803, -0.5367], + [-1.3877, 0.5561, -1.9064, -1.1766], + [-0.5148, 0.0108, -0.4515, -0.6402], + [-0.3400, -0.1711, -0.1855, -0.4219], + [-1.2877, -0.0585, -0.1646, 0.7420]]).to(model.device) + # fmt: on + + output_slice = output.last_hidden_state.squeeze(0)[0:7, 0:4] + self.assertTrue(torch.allclose(output_slice, expected_logits, atol=1e-3)) diff --git a/utils/check_repo.py b/utils/check_repo.py index 488754d2e86..922f7a57d24 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -176,6 +176,7 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping + "AIMv2TextModel", "AlignTextModel", "AlignVisionModel", "ClapTextModel", From 4d9b396828d61df94061d31abdfe4c41bb0bcf5d Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 12:56:27 +0530 Subject: [PATCH 28/62] nit --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index adc9ff2ea29..4dd3cc6d3d7 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1071,4 +1071,4 @@ - local: internal/time_series_utils title: Utilities for Time Series title: Internal helpers - title: API \ No newline at end of file + title: API From 12b206732c0f2271c914e2f197d616f73338307a Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 12:56:51 +0530 Subject: [PATCH 29/62] up --- docs/source/en/model_doc/aimv2.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md index ee9abfe194f..c6c35e0d12b 100644 --- a/docs/source/en/model_doc/aimv2.md +++ b/docs/source/en/model_doc/aimv2.md @@ -36,19 +36,29 @@ The original code can be found [here](). 
## AIMv2Config [[autodoc]] AIMv2Config - - from_text_vision_configs ## AIMv2TextConfig +[[autodoc]] AIMv2TextConfig + +## AIMv2VisionConfig + +[[autodoc]] AIMv2VisionConfig + +## AIMv2Model [[autodoc]] AIMv2Model - forward +## AIMv2VisionModel + +[[autodoc]] AIMv2VisionModel + - forward + ## AIMv2TextModel [[autodoc]] AIMv2TextModel - forward - From ebd1c9c8141bf89af56eed8df1fb4d04448753df Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 15:10:53 +0530 Subject: [PATCH 30/62] =?UTF-8?q?Happy=20CI=20=E2=9C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/transformers/__init__.py | 1 + .../models/aimv2/configuration_aimv2.py | 29 +++++++++++++++---- .../models/aimv2/modular_aimv2.py | 29 +++++++++++++++---- .../models/auto/configuration_auto.py | 3 +- src/transformers/models/auto/modeling_auto.py | 1 + src/transformers/utils/dummy_pt_objects.py | 7 +++++ 6 files changed, 57 insertions(+), 13 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6972bcf8295..6b8c4a9f646 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -6755,6 +6755,7 @@ if TYPE_CHECKING: from .modeling_utils import AttentionInterface, PreTrainedModel from .models.aimv2 import ( AIMv2Model, + AIMv2PreTrainedModel, AIMv2TextModel, AIMv2VisionModel, ) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 9d370a5853e..66f858ab0e2 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -31,7 +31,7 @@ class AIMv2VisionConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 - [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -68,7 +68,22 @@ class AIMv2VisionConfig(PretrainedConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. - """ + is_causal (`bool`, *optional*, defaults to `False`): + Whether to apply causal masking in scaled dot-product attention. + Example: + + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + + >>> # Initializing a AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = AIMv2VisionConfig() + + >>> # Initializing a AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = AIMv2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" model_type = "aimv2_vision_model" base_config_key = "vision_config" @@ -160,6 +175,8 @@ class AIMv2TextConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). 
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the for initializing all weight matrices. + is_causal (`bool`, *optional*, defaults to `True`): + Whether to apply causal masking in scaled dot-product attention. """ model_type = "aimv2_text_model" @@ -210,7 +227,7 @@ class AIMv2Config(PretrainedConfig): [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2 - [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -232,10 +249,10 @@ class AIMv2Config(PretrainedConfig): ```python >>> from transformers import AIMv2Config, AIMv2Model - >>> # Initializing a AIMv2Config with google/aimv2-base-patch16-224 style configuration + >>> # Initializing a AIMv2Config with apple/aimv2-large-patch14-224-lit style configuration >>> configuration = AIMv2Config() - >>> # Initializing a AIMv2Model (with random weights) from the google/aimv2-base-patch16-224 style configuration + >>> # Initializing a AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration >>> model = AIMv2Model(configuration) >>> # Accessing the model configuration @@ -248,7 +265,7 @@ class AIMv2Config(PretrainedConfig): >>> config_text = AIMv2TextConfig() >>> config_vision = AIMv2VisionConfig() - >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) + >>> config = AIMv2Config(text_config=config_text, vision_config=config_vision) ```""" model_type = "aimv2" diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 9086f0aaa5f..bb57e01f101 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -46,7 +46,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 - [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -83,7 +83,22 @@ class AIMv2VisionConfig(SiglipVisionConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. - """ + is_causal (`bool`, *optional*, defaults to `False`): + Whether to apply causal masking in scaled dot-product attention. 
+ Example: + + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + + >>> # Initializing a AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = AIMv2VisionConfig() + + >>> # Initializing a AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = AIMv2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" def __init__( self, @@ -176,6 +191,8 @@ class AIMv2TextConfig(SiglipTextConfig): just in case (e.g., 512 or 1024 or 2048). initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the for initializing all weight matrices. + is_causal (`bool`, *optional*, defaults to `True`): + Whether to apply causal masking in scaled dot-product attention. """ def __init__( @@ -232,7 +249,7 @@ class AIMv2Config(SiglipConfig): [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2 - [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -254,10 +271,10 @@ class AIMv2Config(SiglipConfig): ```python >>> from transformers import AIMv2Config, AIMv2Model - >>> # Initializing a AIMv2Config with google/aimv2-base-patch16-224 style configuration + >>> # Initializing a AIMv2Config with apple/aimv2-large-patch14-224-lit style configuration >>> configuration = AIMv2Config() - >>> # Initializing a AIMv2Model (with random weights) from the google/aimv2-base-patch16-224 style configuration + >>> # Initializing a AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration >>> model = AIMv2Model(configuration) >>> # Accessing the model configuration @@ -270,7 +287,7 @@ class AIMv2Config(SiglipConfig): >>> config_text = AIMv2TextConfig() >>> config_vision = AIMv2VisionConfig() - >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision) + >>> config = AIMv2Config(text_config=config_text, vision_config=config_vision) ```""" def __init__( diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index e911b30f096..364b3b66440 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -33,6 +33,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here ("aimv2", "AIMv2Config"), + ("aimv2_vision_model", "AIMv2VisionConfig"), ("albert", "AlbertConfig"), ("align", "AlignConfig"), ("altclip", "AltCLIPConfig"), @@ -362,7 +363,6 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here ("aimv2", "AIMv2"), - ("aimv2_text_model", "AIMv2TextModel"), ("aimv2_vision_model", "AIMv2VisionModel"), ("albert", "ALBERT"), ("align", "ALIGN"), @@ -769,6 +769,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict( ("gemma3_text", "gemma3"), ("idefics3_vision", "idefics3"), ("siglip_vision_model", "siglip"), + ("aimv2_vision_model", "aimv2"), 
("smolvlm_vision", "smolvlm"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", "rt_detr"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index dbb9f17b79e..dd0be0237ea 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -607,6 +607,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict( [ # Model for Image mapping + ("aimv2_vision_model", "AIMv2VisionModel"), ("beit", "BeitModel"), ("bit", "BitModel"), ("conditional_detr", "ConditionalDetrModel"), diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 30d38784ba3..c82d49a56ad 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -570,6 +570,13 @@ class AIMv2Model(metaclass=DummyObject): requires_backends(self, ["torch"]) +class AIMv2PreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AIMv2TextModel(metaclass=DummyObject): _backends = ["torch"] From ff5f4d82eefd03ad082b850b4e8569bf8c566feb Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 15:26:40 +0530 Subject: [PATCH 31/62] Reduce LOC --- .../models/aimv2/modeling_aimv2.py | 43 +++++-------------- .../models/aimv2/modular_aimv2.py | 43 +++++-------------- 2 files changed, 22 insertions(+), 64 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 76c77c28ffa..618476b7a0d 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -58,13 +58,13 @@ class AIMv2Output(ModelOutput): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. + The text embeddings obtained by applying the projection layer to the pooled output of [`AIMv2TextModel`]. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. + The image embeddings obtained by applying the projection layer to the pooled output of [`AIMv2VisionModel`]. text_model_output (`BaseModelOutputWithPooling`): - The output of the [`CLIPTextModel`]. + The output of the [`AIMv2TextModel`]. vision_model_output (`BaseModelOutput`): - The output of the [`CLIPVisionModel`]. + The output of the [`AIMv2VisionModel`]. 
""" logits_per_image: torch.FloatTensor = None @@ -512,12 +512,10 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) - # Use attention pooling head only for lit vairant self.use_head = config.use_head if self.use_head: self.head = AIMv2AttentionPoolingHead(config) - # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: @@ -550,9 +548,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) - pooler_output = None - if self.use_head: - pooler_output = self.head(last_hidden_state) + pooler_output = self.head(last_hidden_state) if self.use_head else None output = BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, @@ -576,7 +572,6 @@ class AIMv2TextModel(AIMv2PreTrainedModel): self.eos_token_id = config.eos_token_id - # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: @@ -758,27 +753,12 @@ class AIMv2Model(AIMv2PreTrainedModel): def __init__(self, config: AIMv2Config): super().__init__(config) - if not isinstance(config.vision_config, AIMv2VisionConfig): - raise TypeError( - "config.vision_config is expected to be of type AIMv2VisionConfig but is of type" - f" {type(config.vision_config)}." - ) - - if not isinstance(config.text_config, AIMv2TextConfig): - raise TypeError( - "config.text_config is expected to be of type AIMv2TextConfig but is of type" - f" {type(config.text_config)}." - ) - - vision_config = config.vision_config - text_config = config.text_config - self.projection_dim = config.projection_dim - self.vision_embed_dim = vision_config.hidden_size - self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = config.vision_config.hidden_size + self.text_embed_dim = config.text_config.hidden_size - self.vision_model = AIMv2VisionModel._from_config(vision_config) - self.text_model = AIMv2TextModel._from_config(text_config) + self.vision_model = AIMv2VisionModel._from_config(config.vision_config) + self.text_model = AIMv2TextModel._from_config(config.text_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) @@ -786,7 +766,6 @@ class AIMv2Model(AIMv2PreTrainedModel): self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) self.max_logit_scale = math.log(config.max_logit_scale) - # Initialize weights and apply final processing self.post_init() @add_start_docstrings_to_model_forward(AIMV2_TEXT_INPUTS_DOCSTRING) @@ -913,8 +892,8 @@ class AIMv2Model(AIMv2PreTrainedModel): >>> import requests >>> from transformers import AutoProcessor, AIMv2Model - >>> model = AIMv2Model.from_pretrained("openai/clip-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + >>> model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index bb57e01f101..f38189b546b 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ 
b/src/transformers/models/aimv2/modular_aimv2.py @@ -314,13 +314,13 @@ class AIMv2Output(ModelOutput): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. + The text embeddings obtained by applying the projection layer to the pooled output of [`AIMv2TextModel`]. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. + The image embeddings obtained by applying the projection layer to the pooled output of [`AIMv2VisionModel`]. text_model_output (`BaseModelOutputWithPooling`): - The output of the [`CLIPTextModel`]. + The output of the [`AIMv2TextModel`]. vision_model_output (`BaseModelOutput`): - The output of the [`CLIPVisionModel`]. + The output of the [`AIMv2VisionModel`]. """ logits_per_image: torch.FloatTensor = None @@ -594,12 +594,10 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): self.encoder = AIMv2Encoder(config) self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) - # Use attention pooling head only for lit vairant self.use_head = config.use_head if self.use_head: self.head = AIMv2AttentionPoolingHead(config) - # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: @@ -632,9 +630,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): last_hidden_state = encoder_outputs[0] last_hidden_state = self.rms_norm(last_hidden_state) - pooler_output = None - if self.use_head: - pooler_output = self.head(last_hidden_state) + pooler_output = self.head(last_hidden_state) if self.use_head else None output = BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, @@ -658,7 +654,6 @@ class AIMv2TextModel(AIMv2PreTrainedModel): self.eos_token_id = config.eos_token_id - # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: @@ -721,27 +716,12 @@ class AIMv2Model(CLIPModel, nn.Module): def __init__(self, config: AIMv2Config): nn.Module().__init__(config) - if not isinstance(config.vision_config, AIMv2VisionConfig): - raise TypeError( - "config.vision_config is expected to be of type AIMv2VisionConfig but is of type" - f" {type(config.vision_config)}." - ) - - if not isinstance(config.text_config, AIMv2TextConfig): - raise TypeError( - "config.text_config is expected to be of type AIMv2TextConfig but is of type" - f" {type(config.text_config)}." 
- ) - - vision_config = config.vision_config - text_config = config.text_config - self.projection_dim = config.projection_dim - self.vision_embed_dim = vision_config.hidden_size - self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = config.vision_config.hidden_size + self.text_embed_dim = config.text_config.hidden_size - self.vision_model = AIMv2VisionModel._from_config(vision_config) - self.text_model = AIMv2TextModel._from_config(text_config) + self.vision_model = AIMv2VisionModel._from_config(config.vision_config) + self.text_model = AIMv2TextModel._from_config(config.text_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) @@ -749,7 +729,6 @@ class AIMv2Model(CLIPModel, nn.Module): self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) self.max_logit_scale = math.log(config.max_logit_scale) - # Initialize weights and apply final processing self.post_init() def forward( @@ -772,8 +751,8 @@ class AIMv2Model(CLIPModel, nn.Module): >>> import requests >>> from transformers import AutoProcessor, AIMv2Model - >>> model = AIMv2Model.from_pretrained("openai/clip-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + >>> model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) From 13ce5c83b234f402275bd8a3bdfce0d2f7ff8c8e Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 29 Mar 2025 15:33:26 +0530 Subject: [PATCH 32/62] nit --- src/transformers/models/aimv2/configuration_aimv2.py | 2 +- src/transformers/models/aimv2/modular_aimv2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 66f858ab0e2..f2d4c5eb925 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -134,7 +134,7 @@ class AIMv2TextConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 - [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index f38189b546b..c1cc5c3d851 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -150,7 +150,7 @@ class AIMv2TextConfig(SiglipTextConfig): This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a AIMv2 text encoder according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 - [google/aimv2-base-patch16-224](https://huggingface.co/google/aimv2-base-patch16-224) architecture. + [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. From 54d0d95e497272fa52d85bd969806e357d091f4b Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 1 Apr 2025 07:38:33 +0530 Subject: [PATCH 33/62] nit --- tests/models/aimv2/test_modeling_aimv2.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 4d1639faf0e..06446117d39 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -386,14 +386,11 @@ class AIMv2ModelTester: config_and_inputs = self.prepare_config_and_inputs() config, input_ids, attention_mask, pixel_values = config_and_inputs - # Set use_head to True for LIT variant - # config.vision_config.use_head = True - inputs_dict = { "input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values, - "return_loss": True, + "return_loss": False, } return config, inputs_dict @@ -609,6 +606,7 @@ class AIMv2ModelIntegrationTest(unittest.TestCase): @require_vision +@require_torch class AIMv2VisionModelIntegrationTests(unittest.TestCase): @slow def test_inference(self): From 9f3fd333705b2c3e8449bfcef1a1c3a46667ef2b Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 1 Apr 2025 16:32:51 +0530 Subject: [PATCH 34/62] make style --- src/transformers/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 982a4f97587..dd44128d2d0 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -159,6 +159,11 @@ _import_structure = { ], # Models "models": [], + "models.aimv2": [ + "AIMv2Config", + "AIMv2TextConfig", + "AIMv2VisionConfig", + ], "models.albert": ["AlbertConfig"], "models.align": [ "AlignConfig", @@ -289,11 +294,6 @@ _import_structure = { "CLIPTokenizer", "CLIPVisionConfig", ], - "models.aimv2": [ - "AIMv2Config", - "AIMv2TextConfig", - "AIMv2VisionConfig", - ], "models.clipseg": [ "CLIPSegConfig", "CLIPSegProcessor", From aabc4602f532521b0e07f4d9ae92cdfb02af2999 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 1 Apr 2025 17:03:33 +0530 Subject: [PATCH 35/62] return_dict refactor --- .../models/aimv2/modeling_aimv2.py | 105 ++++-------------- .../models/aimv2/modular_aimv2.py | 38 +++---- 2 files changed, 35 insertions(+), 108 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 618476b7a0d..e81205f3ba6 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -22,7 +22,7 @@ import math from dataclasses import dataclass -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Optional, Tuple import torch import torch.nn.functional as F @@ -38,8 +38,8 @@ from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, - replace_return_docstrings, ) from .configuration_aimv2 import AIMv2Config, AIMv2TextConfig, 
AIMv2VisionConfig @@ -370,14 +370,14 @@ class AIMv2Encoder(nn.Module): self.gradient_checkpointing = False # Ignore copy + @can_return_tuple def forward( self, inputs_embeds, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -404,7 +404,6 @@ class AIMv2Encoder(nn.Module): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -435,10 +434,10 @@ class AIMv2Encoder(nn.Module): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, ) @@ -521,19 +520,18 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): def get_input_embeddings(self) -> nn.Module: return self.embeddings.patch_embed + @can_return_tuple def forward( self, pixel_values, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.embeddings(pixel_values) @@ -542,7 +540,6 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) last_hidden_state = encoder_outputs[0] @@ -550,15 +547,13 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): pooler_output = self.head(last_hidden_state) if self.use_head else None - output = BaseModelOutputWithPooling( + return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooler_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) - return output if return_dict else output.to_tuple() - class AIMv2TextModel(AIMv2PreTrainedModel): main_input_name = "input_ids" @@ -580,19 +575,18 @@ class AIMv2TextModel(AIMv2PreTrainedModel): def set_input_embeddings(self, value): self.embeddings.token_embedding = value + @can_return_tuple def forward( self, input_ids, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else 
self.config.use_return_dict hidden_states = self.embeddings(input_ids) _, seq_len, _ = hidden_states.shape @@ -608,7 +602,6 @@ class AIMv2TextModel(AIMv2PreTrainedModel): attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) last_hidden_state = encoder_outputs[0] @@ -620,15 +613,13 @@ class AIMv2TextModel(AIMv2PreTrainedModel): (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1), ] - output = BaseModelOutputWithPooling( + return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) - return output if return_dict else output.to_tuple() - def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: """ @@ -705,45 +696,6 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ -AIMV2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. - return_loss (`bool`, *optional*): - Whether or not to return the contrastive loss. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults `False`): - Whether to interpolate the pre-trained position encodings. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" - @add_start_docstrings(AIMV2_START_DOCSTRING) class AIMv2Model(AIMv2PreTrainedModel): @@ -776,7 +728,6 @@ class AIMv2Model(AIMv2PreTrainedModel): position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -799,18 +750,16 @@ class AIMv2Model(AIMv2PreTrainedModel): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = text_outputs[1] + pooled_output = text_outputs.pooler_output text_features = self.text_projection(pooled_output) return text_features @@ -822,7 +771,6 @@ class AIMv2Model(AIMv2PreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -851,23 +799,20 @@ class AIMv2Model(AIMv2PreTrainedModel): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - pooled_output = vision_outputs[1] # pooled_output + pooled_output = vision_outputs.pooler_output image_features = self.visual_projection(pooled_output) return image_features - @add_start_docstrings_to_model_forward(AIMV2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=AIMv2Output, config_class=AIMv2Config) + @can_return_tuple def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -876,8 +821,7 @@ class AIMv2Model(AIMv2PreTrainedModel): return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, AIMv2Output]: + ) -> AIMv2Output: r""" Returns: @@ -911,21 +855,18 @@ class AIMv2Model(AIMv2PreTrainedModel): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) image_embeds = vision_outputs.pooler_output @@ -942,7 +883,7 @@ class AIMv2Model(AIMv2PreTrainedModel): logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() - 
output = AIMv2Output( + return AIMv2Output( logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, @@ -951,7 +892,5 @@ class AIMv2Model(AIMv2PreTrainedModel): vision_model_output=vision_outputs, ) - return output if return_dict else output.to_tuple() - __all__ = ["AIMv2VisionModel", "AIMv2Model", "AIMv2PreTrainedModel", "AIMv2TextModel"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index c1cc5c3d851..96805842422 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -17,7 +17,7 @@ import math from dataclasses import dataclass -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Optional, Tuple import torch import torch.nn.functional as F @@ -30,6 +30,7 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN from ...utils import ( ModelOutput, + can_return_tuple, logging, ) from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm @@ -603,19 +604,18 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): def get_input_embeddings(self) -> nn.Module: return self.embeddings.patch_embed + @can_return_tuple def forward( self, pixel_values, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.embeddings(pixel_values) @@ -624,7 +624,6 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) last_hidden_state = encoder_outputs[0] @@ -632,15 +631,13 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): pooler_output = self.head(last_hidden_state) if self.use_head else None - output = BaseModelOutputWithPooling( + return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooler_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) - return output if return_dict else output.to_tuple() - class AIMv2TextModel(AIMv2PreTrainedModel): main_input_name = "input_ids" @@ -662,19 +659,18 @@ class AIMv2TextModel(AIMv2PreTrainedModel): def set_input_embeddings(self, value): self.embeddings.token_embedding = value + @can_return_tuple def forward( self, input_ids, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.embeddings(input_ids) _, seq_len, _ = hidden_states.shape @@ -690,7 +686,6 @@ class AIMv2TextModel(AIMv2PreTrainedModel): attention_mask=attention_mask, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) last_hidden_state = encoder_outputs[0] @@ -702,15 +697,13 @@ class AIMv2TextModel(AIMv2PreTrainedModel): (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1), ] - output = BaseModelOutputWithPooling( + return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) - return output if return_dict else output.to_tuple() - class AIMv2Model(CLIPModel, nn.Module): def __init__(self, config: AIMv2Config): @@ -731,6 +724,7 @@ class AIMv2Model(CLIPModel, nn.Module): self.post_init() + @can_return_tuple def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -739,8 +733,7 @@ class AIMv2Model(CLIPModel, nn.Module): return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, AIMv2Output]: + ) -> AIMv2Output: r""" Returns: @@ -770,21 +763,18 @@ class AIMv2Model(CLIPModel, nn.Module): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) image_embeds = vision_outputs.pooler_output @@ -801,7 +791,7 @@ class AIMv2Model(CLIPModel, nn.Module): logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() - output = AIMv2Output( + return AIMv2Output( logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, @@ -810,8 +800,6 @@ class AIMv2Model(CLIPModel, nn.Module): vision_model_output=vision_outputs, ) - return output if return_dict else output.to_tuple() - __all__ = [ "AIMv2Config", From 8f85bf8bf0adaa9cd3e14cd139b3834063bcd388 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 1 Apr 2025 22:22:33 +0530 Subject: [PATCH 36/62] bug fix --- src/transformers/models/aimv2/modeling_aimv2.py | 5 +++-- src/transformers/models/aimv2/modular_aimv2.py | 4 ++-- tests/models/aimv2/test_modeling_aimv2.py | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index e81205f3ba6..01cfb8b5aad 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -297,7 +297,7 @@ class AIMv2Attention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + if self.config._attn_implementation == "sdpa" and output_attentions: logger.warning_once( "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " 'eager attention. 
This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' @@ -478,7 +478,7 @@ class AIMv2AttentionPoolingHead(nn.Module): class AIMv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. + models. The model is only intended for inference and doesn't support finetuning. """ config_class = AIMv2Config @@ -503,6 +503,7 @@ class AIMv2PreTrainedModel(PreTrainedModel): class AIMv2VisionModel(AIMv2PreTrainedModel): main_input_name = "pixel_values" + base_model_prefix = "aimv2_vision_model" def __init__(self, config: AIMv2VisionConfig): super().__init__(config) diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 96805842422..9a9d4a169de 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -464,7 +464,7 @@ class AIMv2Attention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + if self.config._attn_implementation == "sdpa" and output_attentions: logger.warning_once( "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' @@ -562,7 +562,7 @@ class AIMv2AttentionPoolingHead(nn.Module): class AIMv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. + models. The model is only intended for inference and doesn't support finetuning. 
""" config_class = AIMv2Config diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 06446117d39..2c03a91fdad 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -312,6 +312,7 @@ class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): fx_compatible = False test_pruning = False test_head_masking = False + test_resize_embeddings = False def setUp(self): self.model_tester = AIMv2TextModelTester(self) From 4fdb728d9737315e9f7f23e27d46669c540efdf4 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 2 Apr 2025 18:36:06 +0530 Subject: [PATCH 37/62] fix --- src/transformers/models/aimv2/modeling_aimv2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 01cfb8b5aad..c14706c15dc 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -503,7 +503,6 @@ class AIMv2PreTrainedModel(PreTrainedModel): class AIMv2VisionModel(AIMv2PreTrainedModel): main_input_name = "pixel_values" - base_model_prefix = "aimv2_vision_model" def __init__(self, config: AIMv2VisionConfig): super().__init__(config) From b316999e1ad5f455fe5272c912e6278ee39ce922 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 2 Apr 2025 18:36:11 +0530 Subject: [PATCH 38/62] doc update --- docs/source/en/model_doc/aimv2.md | 63 ++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md index c6c35e0d12b..51bf9b413ae 100644 --- a/docs/source/en/model_doc/aimv2.md +++ b/docs/source/en/model_doc/aimv2.md @@ -18,20 +18,71 @@ rendered properly in your Markdown viewer. ## Overview -The AIMv2 model was proposed in []() by . +The AIMv2 model was proposed in [Multimodal Autoregressive Pre-training of Large Vision Encoders](https://arxiv.org/abs/2411.14402) by Enrico Fini, Mustafa Shukor, Xiujun Li, Philipp Dufter, Michal Klein, David Haldimann, Sai Aitharaju, Victor Guilherme Turrisi da Costa, Louis Béthune, Zhe Gan, Alexander T Toshev, Marcin Eichner, Moin Nabi, Yinfei Yang, Joshua M. Susskind, Alaaeldin El-Nouby. The abstract from the paper is the following: -** +*We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.* -Tips: - +This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali). +The original code can be found [here](https://github.com/apple/ml-aim). -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). 
-The original code can be found [here](). +## Usage Example +Here is an example of Image Feature Extraction using specific checkpoints on resized images and native resolution images: + +```python +import requests +from PIL import Image +from transformers import AutoImageProcessor, AutoModel + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +processor = AutoImageProcessor.from_pretrained( + "apple/aimv2-large-patch14-native", +) +model = AutoModel.from_pretrained( + "apple/aimv2-large-patch14-native", + trust_remote_code=True, +) + +inputs = processor(images=image, return_tensors="pt") +outputs = model(**inputs) +``` + +Here is an example of checkpoint performing zero shot classification: + +```python +import requests +from PIL import Image +from transformers import AutoProcessor, AutoModel + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +text = ["Picture of a dog.", "Picture of a cat.", "Picture of a horse."] + +processor = AutoProcessor.from_pretrained( + "apple/aimv2-large-patch14-224-lit", +) +model = AutoModel.from_pretrained( + "apple/aimv2-large-patch14-224-lit", + trust_remote_code=True, +) + +inputs = processor( + images=image, + text=text, + add_special_tokens=True, + truncation=True, + padding=True, + return_tensors="pt", +) +outputs = model(**inputs) +probs = outputs.logits_per_image.softmax(dim=-1) +``` ## AIMv2Config From ec510c33a1e1568b5afdc1ff72f4dbe66a12677e Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Mon, 7 Apr 2025 14:18:51 +0530 Subject: [PATCH 39/62] nit --- src/transformers/models/aimv2/modeling_aimv2.py | 4 ---- src/transformers/models/aimv2/modular_aimv2.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index c14706c15dc..fc2aab348db 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -826,10 +826,6 @@ class AIMv2Model(AIMv2PreTrainedModel): Returns: Examples: - Returns: - - Examples: - ```python >>> from PIL import Image diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 9a9d4a169de..c44ecad523e 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -735,10 +735,6 @@ class AIMv2Model(CLIPModel, nn.Module): output_hidden_states: Optional[bool] = None, ) -> AIMv2Output: r""" - Returns: - - Examples: - ```python >>> from PIL import Image >>> import requests From f2d47eb0a270060bfee9ff7a7dbaf44765043e3b Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sat, 12 Apr 2025 18:40:33 +0530 Subject: [PATCH 40/62] make fixup --- src/transformers/__init__.py | 2 +- src/transformers/models/__init__.py | 2 +- src/transformers/models/aimv2/modeling_aimv2.py | 2 ++ src/transformers/utils/dummy_pt_objects.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4a4e8611ac2..75f03315f9c 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1027,4 +1027,4 @@ if not is_tf_available() and not is_torch_available() and not is_flax_available( "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. " "Models won't be available and only tokenizers, configuration " "and file/data utilities can be used." 
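The zero-shot snippet in the model doc above stops at `probs`. As a hedged follow-up (reusing the `text` list and the `apple/aimv2-large-patch14-224-lit` checkpoint assumed there), the probabilities can be matched back to the candidate captions:

```python
# Continues the zero-shot example from the model doc above; assumes `text` and
# `probs` from that snippet are still in scope.
for caption, prob in zip(text, probs[0].tolist()):
    print(f"{prob:.3f}  {caption}")
```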
- ) \ No newline at end of file + ) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index d6d37bebbe2..25af34ce3f2 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -329,4 +329,4 @@ else: import sys _file = globals()["__file__"] - sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) \ No newline at end of file + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index fc2aab348db..dcc6058a17b 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -33,6 +33,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPooling from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN +from ...integrations import use_kernel_forward_from_hub from ...modeling_outputs import BaseModelOutput from ...utils import ( ModelOutput, @@ -81,6 +82,7 @@ class AIMv2Output(ModelOutput): ) +@use_kernel_forward_from_hub("RMSNorm") class AIMv2RMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index e60b1e32768..55c592082c5 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -640,4 +640,4 @@ class Seq2SeqTrainer(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) \ No newline at end of file + requires_backends(self, ["torch"]) From ce67e589ba640657e1d45f209b27d84797a3d5d1 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Sun, 13 Apr 2025 22:23:35 +0530 Subject: [PATCH 41/62] Minor update --- src/transformers/models/aimv2/configuration_aimv2.py | 8 -------- src/transformers/models/aimv2/modeling_aimv2.py | 3 +-- src/transformers/models/aimv2/modular_aimv2.py | 11 +---------- tests/models/aimv2/test_modeling_aimv2.py | 11 +++++++++++ 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index f2d4c5eb925..a3498f371e3 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -68,8 +68,6 @@ class AIMv2VisionConfig(PretrainedConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. - is_causal (`bool`, *optional*, defaults to `False`): - Whether to apply causal masking in scaled dot-product attention. Example: ```python @@ -105,7 +103,6 @@ class AIMv2VisionConfig(PretrainedConfig): hidden_act: str = "silu", initializer_range: float = 0.02, use_head: bool = True, - is_causal: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -126,7 +123,6 @@ class AIMv2VisionConfig(PretrainedConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.is_causal = is_causal class AIMv2TextConfig(PretrainedConfig): @@ -175,8 +171,6 @@ class AIMv2TextConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). 
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the for initializing all weight matrices. - is_causal (`bool`, *optional*, defaults to `True`): - Whether to apply causal masking in scaled dot-product attention. """ model_type = "aimv2_text_model" @@ -200,7 +194,6 @@ class AIMv2TextConfig(PretrainedConfig): eos_token_id: int = 49407, max_position_embeddings: int = 77, initializer_range: bool = 0.02, - is_causal: bool = True, **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -219,7 +212,6 @@ class AIMv2TextConfig(PretrainedConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.is_causal = is_causal class AIMv2Config(PretrainedConfig): diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index dcc6058a17b..caf9cfc4a83 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -276,7 +276,7 @@ class AIMv2Attention(nn.Module): self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.proj_drop = nn.Dropout(config.projection_dropout) - self.is_causal = config.is_causal + self.is_causal = False def forward( self, @@ -539,7 +539,6 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): encoder_outputs = self.encoder( inputs_embeds=hidden_states, - attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index c44ecad523e..9f6d8f80cd3 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -84,8 +84,6 @@ class AIMv2VisionConfig(SiglipVisionConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. - is_causal (`bool`, *optional*, defaults to `False`): - Whether to apply causal masking in scaled dot-product attention. Example: ```python @@ -118,7 +116,6 @@ class AIMv2VisionConfig(SiglipVisionConfig): hidden_act: str = "silu", initializer_range: float = 0.02, use_head: bool = True, - is_causal: bool = False, **kwargs, ): super().__init__( @@ -141,7 +138,6 @@ class AIMv2VisionConfig(SiglipVisionConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.is_causal = is_causal del self.layer_norm_eps @@ -192,8 +188,6 @@ class AIMv2TextConfig(SiglipTextConfig): just in case (e.g., 512 or 1024 or 2048). initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the for initializing all weight matrices. - is_causal (`bool`, *optional*, defaults to `True`): - Whether to apply causal masking in scaled dot-product attention. 
""" def __init__( @@ -214,7 +208,6 @@ class AIMv2TextConfig(SiglipTextConfig): eos_token_id: int = 49407, max_position_embeddings: int = 77, initializer_range: bool = 0.02, - is_causal: bool = True, **kwargs, ): super().__init__( @@ -237,7 +230,6 @@ class AIMv2TextConfig(SiglipTextConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout - self.is_causal = is_causal del self.bos_token_id del self.pad_token_id @@ -441,7 +433,7 @@ class AIMv2Attention(nn.Module): self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.proj_drop = nn.Dropout(config.projection_dropout) - self.is_causal = config.is_causal + self.is_causal = False def forward( self, @@ -621,7 +613,6 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): encoder_outputs = self.encoder( inputs_embeds=hidden_states, - attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 2c03a91fdad..f0d879bb706 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -20,10 +20,12 @@ import unittest import numpy as np import requests +from parameterized import parameterized from pytest import mark from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig from transformers.testing_utils import ( + is_flaky, require_flash_attn, require_torch, require_torch_gpu, @@ -38,11 +40,13 @@ from transformers.utils import ( from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( + TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor, random_attention_mask, + require_torch_sdpa, ) from ...test_pipeline_mixin import PipelineTesterMixin @@ -572,6 +576,13 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa f"Text logits max diff: {torch.max(torch.abs(logits_per_text_eager - logits_per_text_sdpa))}", ) + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) + @require_torch_sdpa + @is_flaky() + def test_eager_matches_sdpa_inference(self, *args): + # Adding only flaky decorator here and call the parent test method + return getattr(ModelTesterMixin, self._testMethodName)(self) + @require_vision @require_torch From 67ed7118d192202ed10a0222d1d7246ea60735a8 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Thu, 17 Apr 2025 16:32:38 +0530 Subject: [PATCH 42/62] _init_weigths modifcation --- src/transformers/models/aimv2/modeling_aimv2.py | 5 +++++ src/transformers/models/aimv2/modular_aimv2.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index caf9cfc4a83..bab3d06ad93 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -499,8 +499,13 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, AIMv2RMSNorm): + module.weight.data.fill_(1.0) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) + elif hasattr(module, "logit_scale"): + if isinstance(module.logit_scale, nn.Parameter): + module.logit_scale.data.fill_(math.log(1 / 0.07)) class AIMv2VisionModel(AIMv2PreTrainedModel): diff --git 
a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 9f6d8f80cd3..f96ec8b9c59 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -573,8 +573,13 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, AIMv2RMSNorm): + module.weight.data.fill_(1.0) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) + elif hasattr(module, "logit_scale"): + if isinstance(module.logit_scale, nn.Parameter): + module.logit_scale.data.fill_(math.log(1 / 0.07)) class AIMv2VisionModel(AIMv2PreTrainedModel): From 6277203fe06e7ba7a6cc19d15cd669a21703b628 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 23 Apr 2025 20:41:21 +0530 Subject: [PATCH 43/62] update tests --- tests/models/aimv2/test_modeling_aimv2.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index f0d879bb706..1261946c998 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -76,7 +76,7 @@ class AIMv2VisionModelTester: image_size=30, patch_size=2, num_channels=3, - is_training=True, + is_training=False, hidden_size=32, projection_dim=32, num_hidden_layers=2, @@ -233,7 +233,7 @@ class AIMv2TextModelTester: parent, batch_size=12, seq_length=7, - is_training=True, + is_training=False, use_input_mask=True, use_labels=True, vocab_size=99, @@ -329,29 +329,13 @@ class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="This model has no Loss") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="This model has no Loss") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="AIMv2 does not use inputs_embeds") def test_inputs_embeds(self): pass class AIMv2ModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): + def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=False): if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: From 9af376477101cc9588d9cee0fded91cb1b6478f7 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 23 Apr 2025 20:41:56 +0530 Subject: [PATCH 44/62] Minor fixes post review --- docs/source/en/model_doc/aimv2.md | 21 ++----- .../models/aimv2/configuration_aimv2.py | 4 ++ .../convert_aimv2_original_pytorch_to_hf.py | 11 ++-- .../models/aimv2/modeling_aimv2.py | 25 ++++++--- .../models/aimv2/modular_aimv2.py | 56 ++++++------------- .../models/auto/image_processing_auto.py | 2 + .../models/auto/processing_auto.py | 2 + 7 files changed, 50 insertions(+), 71 deletions(-) diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md index 51bf9b413ae..7db1c291bc4 100644 --- a/docs/source/en/model_doc/aimv2.md +++ b/docs/source/en/model_doc/aimv2.md @@ -19,7 +19,6 @@ rendered properly in your Markdown viewer. 
## Overview The AIMv2 model was proposed in [Multimodal Autoregressive Pre-training of Large Vision Encoders](https://arxiv.org/abs/2411.14402) by Enrico Fini, Mustafa Shukor, Xiujun Li, Philipp Dufter, Michal Klein, David Haldimann, Sai Aitharaju, Victor Guilherme Turrisi da Costa, Louis Béthune, Zhe Gan, Alexander T Toshev, Marcin Eichner, Moin Nabi, Yinfei Yang, Joshua M. Susskind, Alaaeldin El-Nouby. - The abstract from the paper is the following: @@ -41,19 +40,14 @@ from transformers import AutoImageProcessor, AutoModel url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) -processor = AutoImageProcessor.from_pretrained( - "apple/aimv2-large-patch14-native", -) -model = AutoModel.from_pretrained( - "apple/aimv2-large-patch14-native", - trust_remote_code=True, -) +processor = AutoImageProcessor.from_pretrained("apple/aimv2-large-patch14-native") +model = AutoModel.from_pretrained("apple/aimv2-large-patch14-native") inputs = processor(images=image, return_tensors="pt") outputs = model(**inputs) ``` -Here is an example of checkpoint performing zero shot classification: +Here is an example of a checkpoint performing zero-shot classification: ```python import requests @@ -64,13 +58,8 @@ url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) text = ["Picture of a dog.", "Picture of a cat.", "Picture of a horse."] -processor = AutoProcessor.from_pretrained( - "apple/aimv2-large-patch14-224-lit", -) -model = AutoModel.from_pretrained( - "apple/aimv2-large-patch14-224-lit", - trust_remote_code=True, -) +processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") +model = AutoModel.from_pretrained("apple/aimv2-large-patch14-224-lit") inputs = processor( images=image, diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index a3498f371e3..5810880823e 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -68,6 +68,8 @@ class AIMv2VisionConfig(PretrainedConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. + is_native (`str`, *optional*, defaults to `False`): + Whether to use ckpt trained for image native resolution or not. 
Example: ```python @@ -103,6 +105,7 @@ class AIMv2VisionConfig(PretrainedConfig): hidden_act: str = "silu", initializer_range: float = 0.02, use_head: bool = True, + is_native: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -123,6 +126,7 @@ class AIMv2VisionConfig(PretrainedConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout + self.is_native = is_native class AIMv2TextConfig(PretrainedConfig): diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index ce9e2540b13..d53796bc200 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -164,13 +164,14 @@ def write_model( if hf_repo_id != "apple/aimv2-large-patch14-224-lit": config.use_head = False + if hf_repo_id == "apple/aimv2-large-patch14-native": + config.is_native = True + original_state_dict = load_original_state_dict(hf_repo_id) print("Converting model...") state_dict = {} - # For `apple/aimv2-large-patch14-native` we don't have position_embedding in state_dict - strict_loading = False result = convert_old_keys_to_new_keys(original_state_dict, key_mapping) all_keys = list(original_state_dict.keys()) @@ -187,19 +188,17 @@ def write_model( # Check if position embeddings exist before squeezing if new_key.endswith("position_embedding.weight"): state_dict[new_key] = value.squeeze(0) - strict_loading = True print(f"Loading the checkpoint in a {model_class.__name__}.") model = model_class(config) - model.load_state_dict(state_dict, strict=strict_loading, assign=True) + model.load_state_dict(state_dict, strict=True, assign=True) print("Checkpoint loaded successfully.") print("Saving the model.") model.save_pretrained(output_dir, safe_serialization=safe_serialization) del state_dict, model - - # Safety check: reload the converted model gc.collect() + print("Reloading the model to check if it's saved correctly.") model = model_class.from_pretrained(output_dir, device_map="auto") print("Model reloaded successfully.") diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index bab3d06ad93..7515fc98372 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -52,6 +52,8 @@ logger = logging.get_logger(__name__) class AIMv2Output(ModelOutput): """ Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. @@ -64,14 +66,15 @@ class AIMv2Output(ModelOutput): The image embeddings obtained by applying the projection layer to the pooled output of [`AIMv2VisionModel`]. text_model_output (`BaseModelOutputWithPooling`): The output of the [`AIMv2TextModel`]. - vision_model_output (`BaseModelOutput`): + vision_model_output (`BaseModelOutputWithPooling`): The output of the [`AIMv2VisionModel`]. 
""" - logits_per_image: torch.FloatTensor = None - logits_per_text: torch.FloatTensor = None - text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None @@ -133,7 +136,8 @@ class AIMv2VisionEmbeddings(nn.Module): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) num_patches = (config.image_size // config.patch_size) ** 2 - self.position_embedding = nn.Embedding(num_patches, config.hidden_size) + if not self.config.is_native: + self.position_embedding = nn.Embedding(num_patches, config.hidden_size) self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod @@ -158,7 +162,7 @@ class AIMv2VisionEmbeddings(nn.Module): hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) hidden_states = self.rms_norm(hidden_states) - if self.config.image_size != height or self.config.image_size != width: + if self.config.is_native: pos_embed = self.build_2d_sincos_position_embedding( height // self.patch_size, width // self.patch_size, @@ -506,6 +510,8 @@ class AIMv2PreTrainedModel(PreTrainedModel): elif hasattr(module, "logit_scale"): if isinstance(module.logit_scale, nn.Parameter): module.logit_scale.data.fill_(math.log(1 / 0.07)) + elif isinstance(module, AIMv2AttentionPoolingHead): + module.cls_token.data.normal_(mean=0.0, std=std) class AIMv2VisionModel(AIMv2PreTrainedModel): @@ -516,6 +522,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): self.config = config self.embeddings = AIMv2VisionEmbeddings(config) self.encoder = AIMv2Encoder(config) + # The only change from SiglipVisionTransformer is, layernorm -> rms_norm. 
self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.use_head = config.use_head @@ -722,7 +729,7 @@ class AIMv2Model(AIMv2PreTrainedModel): self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) - self.max_logit_scale = math.log(config.max_logit_scale) + self.max_log_logit_scale = math.log(config.max_logit_scale) self.post_init() @@ -881,7 +888,7 @@ class AIMv2Model(AIMv2PreTrainedModel): image_embeds = image_embeds / _get_vector_norm(image_embeds) text_embeds = text_embeds / _get_vector_norm(text_embeds) - logit_scale = self.logit_scale.clamp(0.0, self.max_logit_scale).exp() + logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp() logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index f96ec8b9c59..3948454cc08 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -16,8 +16,7 @@ """Pytorch implementation of AIMv2 Model""" import math -from dataclasses import dataclass -from typing import Any, Callable, Optional, Tuple +from typing import Callable, Optional, Tuple import torch import torch.nn.functional as F @@ -29,14 +28,13 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN from ...utils import ( - ModelOutput, can_return_tuple, logging, ) from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm from ..llama.modeling_llama import LlamaRMSNorm, eager_attention_forward from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig -from ..siglip.modeling_siglip import SiglipEncoder +from ..siglip.modeling_siglip import SiglipEncoder, SiglipOutput logger = logging.get_logger(__name__) @@ -84,6 +82,8 @@ class AIMv2VisionConfig(SiglipVisionConfig): The standard deviation of the for initializing all weight matrices. use_head (`str`, *optional*, defaults to `True`): Whether to use Attention Pooling Head or Not. + is_native (`str`, *optional*, defaults to `False`): + Whether to use ckpt trained for image native resolution or not. Example: ```python @@ -116,6 +116,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): hidden_act: str = "silu", initializer_range: float = 0.02, use_head: bool = True, + is_native: bool = False, **kwargs, ): super().__init__( @@ -138,6 +139,7 @@ class AIMv2VisionConfig(SiglipVisionConfig): self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps self.projection_dropout = projection_dropout + self.is_native = is_native del self.layer_norm_eps @@ -296,38 +298,8 @@ class AIMv2Config(SiglipConfig): pass -@dataclass -class AIMv2Output(ModelOutput): - """ - Args: - logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): - The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text - similarity scores. - logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): - The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image - similarity scores. - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`AIMv2TextModel`]. 
- image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`AIMv2VisionModel`]. - text_model_output (`BaseModelOutputWithPooling`): - The output of the [`AIMv2TextModel`]. - vision_model_output (`BaseModelOutput`): - The output of the [`AIMv2VisionModel`]. - """ - - logits_per_image: torch.FloatTensor = None - logits_per_text: torch.FloatTensor = None - text_embeds: torch.FloatTensor = None - image_embeds: torch.FloatTensor = None - text_model_output: BaseModelOutputWithPooling = None - vision_model_output: BaseModelOutputWithPooling = None - - def to_tuple(self) -> Tuple[Any]: - return tuple( - self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() - for k in self.keys() - ) +class AIMv2Output(SiglipOutput): + pass class AIMv2RMSNorm(LlamaRMSNorm): @@ -364,7 +336,8 @@ class AIMv2VisionEmbeddings(nn.Module): self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) num_patches = (config.image_size // config.patch_size) ** 2 - self.position_embedding = nn.Embedding(num_patches, config.hidden_size) + if not self.config.is_native: + self.position_embedding = nn.Embedding(num_patches, config.hidden_size) self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) @staticmethod @@ -389,7 +362,7 @@ class AIMv2VisionEmbeddings(nn.Module): hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) hidden_states = self.rms_norm(hidden_states) - if self.config.image_size != height or self.config.image_size != width: + if self.config.is_native: pos_embed = self.build_2d_sincos_position_embedding( height // self.patch_size, width // self.patch_size, @@ -580,6 +553,8 @@ class AIMv2PreTrainedModel(PreTrainedModel): elif hasattr(module, "logit_scale"): if isinstance(module.logit_scale, nn.Parameter): module.logit_scale.data.fill_(math.log(1 / 0.07)) + elif isinstance(module, AIMv2AttentionPoolingHead): + module.cls_token.data.normal_(mean=0.0, std=std) class AIMv2VisionModel(AIMv2PreTrainedModel): @@ -590,6 +565,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): self.config = config self.embeddings = AIMv2VisionEmbeddings(config) self.encoder = AIMv2Encoder(config) + # The only change from SiglipVisionTransformer is, layernorm -> rms_norm. 
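The `_init_weights` additions above fill `logit_scale` with `log(1/0.07)`, and the model later clamps it to `max_log_logit_scale` before exponentiating. A small numeric sketch shows the resulting CLIP-style temperature; `max_logit_scale = 100.0` is an assumed value for the demo, not necessarily the config default.

```python
# Illustrative only: behaviour of the learnable temperature with the
# initialisation and clamping used above. max_logit_scale=100.0 is assumed.
import math
import torch

logit_scale = torch.nn.Parameter(torch.tensor(math.log(1 / 0.07)))  # ~2.659 at init
max_log_logit_scale = math.log(100.0)                               # ~4.605

scale = logit_scale.clamp(0.0, max_log_logit_scale).exp()
print(round(scale.item(), 2))  # 14.29 at initialisation; capped at 100.0 during training
```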
self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.use_head = config.use_head @@ -716,7 +692,7 @@ class AIMv2Model(CLIPModel, nn.Module): self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) - self.max_logit_scale = math.log(config.max_logit_scale) + self.max_log_logit_scale = math.log(config.max_logit_scale) self.post_init() @@ -779,7 +755,7 @@ class AIMv2Model(CLIPModel, nn.Module): image_embeds = image_embeds / _get_vector_norm(image_embeds) text_embeds = text_embeds / _get_vector_norm(text_embeds) - logit_scale = self.logit_scale.clamp(0.0, self.max_logit_scale).exp() + logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp() logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 10ee95475ed..4acd5e9849f 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -56,6 +56,8 @@ if TYPE_CHECKING: else: IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( [ + ("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), ("aria", ("AriaImageProcessor",)), ("beit", ("BeitImageProcessor",)), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c55a4ab2129..489a5cbdd7c 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -45,6 +45,8 @@ logger = logging.get_logger(__name__) PROCESSOR_MAPPING_NAMES = OrderedDict( [ + ("aimv2", "CLIPProcessor"), + ("aimv2_vision_model", "CLIPProcessor"), ("align", "AlignProcessor"), ("altclip", "AltCLIPProcessor"), ("aria", "AriaProcessor"), From 705fa3c72e2a7fa8870b913f66069ed6fbe32972 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 23 Apr 2025 20:51:48 +0530 Subject: [PATCH 45/62] Update w.r.t GradientCheckpointingLayer --- .../models/aimv2/modeling_aimv2.py | 22 +++++++------------ .../models/aimv2/modular_aimv2.py | 3 ++- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 7515fc98372..bb7814fbc69 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -34,6 +34,7 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN from ...integrations import use_kernel_forward_from_hub +from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput from ...utils import ( ModelOutput, @@ -333,7 +334,7 @@ class AIMv2Attention(nn.Module): return output -class AIMv2EncoderLayer(nn.Module): +class AIMv2EncoderLayer(GradientCheckpointingLayer): def __init__(self, config: AIMv2VisionConfig): super().__init__() self.attention = AIMv2Attention(config) @@ -418,19 +419,12 @@ class AIMv2Encoder(nn.Module): for encoder_layer in self.layers: if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - 
encoder_layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - output_attentions=output_attentions, - ) + + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 3948454cc08..89d055f2512 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -27,6 +27,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPooling from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer from ...utils import ( can_return_tuple, logging, @@ -459,7 +460,7 @@ class AIMv2Attention(nn.Module): return output -class AIMv2EncoderLayer(nn.Module): +class AIMv2EncoderLayer(GradientCheckpointingLayer): def __init__(self, config: AIMv2VisionConfig): super().__init__() self.attention = AIMv2Attention(config) From b7e9236cd7780ea68acb6d02619fc9af86211376 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 23 Apr 2025 22:03:28 +0530 Subject: [PATCH 46/62] docs update --- .../models/aimv2/modeling_aimv2.py | 103 +++++++++++++++--- .../models/aimv2/modular_aimv2.py | 65 ++++++++++- 2 files changed, 149 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index bb7814fbc69..81d545ad9c1 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -42,6 +42,7 @@ from ...utils import ( add_start_docstrings_to_model_forward, can_return_tuple, logging, + replace_return_docstrings, ) from .configuration_aimv2 import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig @@ -144,7 +145,7 @@ class AIMv2VisionEmbeddings(nn.Module): @staticmethod def build_2d_sincos_position_embedding( height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 - ): + ) -> torch.Tensor: grid_w = torch.arange(int(width), dtype=dtype, device=device) grid_h = torch.arange(int(height), dtype=dtype, device=device) grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy") @@ -508,6 +509,41 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.cls_token.data.normal_(mean=0.0, std=std) +AIMV2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +AIMV2_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. 
Pixel values can be obtained using + [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. +""" + + +@add_start_docstrings( + """The vision model from AIMv2 without any head or projection on top.""", + AIMV2_START_DOCSTRING, +) class AIMv2VisionModel(AIMv2PreTrainedModel): main_input_name = "pixel_values" @@ -529,6 +565,8 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): return self.embeddings.patch_embed @can_return_tuple + @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AIMv2VisionConfig) def forward( self, pixel_values, @@ -536,6 +574,28 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> BaseModelOutputWithPooling: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Siglip2VisionModel + + >>> model = AIMv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled features + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -639,21 +699,6 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: return normed_tensor -AIMV2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - AIMV2_TEXT_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): @@ -686,11 +731,33 @@ AIMV2_TEXT_INPUTS_DOCSTRING = r""" Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" -AIMV2_VISION_INPUTS_DOCSTRING = r""" +AIMV2_INPUTS_DOCSTRING = r""" Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -820,6 +887,8 @@ class AIMv2Model(AIMv2PreTrainedModel): return image_features @can_return_tuple + @add_start_docstrings_to_model_forward(AIMV2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AIMv2Output, config_class=AIMv2Config) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 89d055f2512..ba0db0eb725 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -29,8 +29,11 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...activations import ACT2FN from ...modeling_layers import GradientCheckpointingLayer from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, can_return_tuple, logging, + replace_return_docstrings, ) from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm from ..llama.modeling_llama import LlamaRMSNorm, eager_attention_forward @@ -344,7 +347,7 @@ class AIMv2VisionEmbeddings(nn.Module): @staticmethod def build_2d_sincos_position_embedding( height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 - ): + ) -> torch.Tensor: grid_w = torch.arange(int(width), dtype=dtype, device=device) grid_h = torch.arange(int(height), dtype=dtype, device=device) grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy") @@ -525,6 +528,37 @@ class AIMv2AttentionPoolingHead(nn.Module): return output +AIMV2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +AIMV2_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. +""" + + class AIMv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -558,6 +592,10 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.cls_token.data.normal_(mean=0.0, std=std) +@add_start_docstrings( + """The vision model from AIMv2 without any head or projection on top.""", + AIMV2_START_DOCSTRING, +) class AIMv2VisionModel(AIMv2PreTrainedModel): main_input_name = "pixel_values" @@ -579,6 +617,8 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): return self.embeddings.patch_embed @can_return_tuple + @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AIMv2VisionConfig) def forward( self, pixel_values, @@ -586,6 +626,28 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> BaseModelOutputWithPooling: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Siglip2VisionModel + + >>> model = AIMv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled features + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -697,7 +759,6 @@ class AIMv2Model(CLIPModel, nn.Module): self.post_init() - @can_return_tuple def forward( self, input_ids: Optional[torch.LongTensor] = None, From ef9f9081dd6b73b72ceff9d324f9b1eae22aeb73 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Thu, 24 Apr 2025 05:22:49 +0530 Subject: [PATCH 47/62] update --- .../models/aimv2/modeling_aimv2.py | 23 +++++++++++++++---- 
.../models/aimv2/modular_aimv2.py | 8 +++---- .../models/auto/processing_auto.py | 1 - 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 81d545ad9c1..454cac875df 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -509,7 +509,7 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.cls_token.data.normal_(mean=0.0, std=std) -AIMV2_START_DOCSTRING = r""" +AIMV2_VISION_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) @@ -519,7 +519,7 @@ AIMV2_START_DOCSTRING = r""" and behavior. Parameters: - config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. + config ([`AIMv2VisionConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -528,7 +528,7 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -542,7 +542,7 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" @add_start_docstrings( """The vision model from AIMv2 without any head or projection on top.""", - AIMV2_START_DOCSTRING, + AIMV2_VISION_START_DOCSTRING, ) class AIMv2VisionModel(AIMv2PreTrainedModel): main_input_name = "pixel_values" @@ -699,6 +699,21 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: return normed_tensor +AIMV2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + AIMV2_TEXT_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index ba0db0eb725..3190a84775a 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -528,7 +528,7 @@ class AIMv2AttentionPoolingHead(nn.Module): return output -AIMV2_START_DOCSTRING = r""" +AIMV2_VISION_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) @@ -538,7 +538,7 @@ AIMV2_START_DOCSTRING = r""" and behavior. Parameters: - config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. + config ([`AIMv2VisionConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -547,7 +547,7 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -594,7 +594,7 @@ class AIMv2PreTrainedModel(PreTrainedModel): @add_start_docstrings( """The vision model from AIMv2 without any head or projection on top.""", - AIMV2_START_DOCSTRING, + AIMV2_VISION_START_DOCSTRING, ) class AIMv2VisionModel(AIMv2PreTrainedModel): main_input_name = "pixel_values" diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c97d0102323..70eaf5b9730 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -46,7 +46,6 @@ logger = logging.get_logger(__name__) PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("aimv2", "CLIPProcessor"), - ("aimv2_vision_model", "CLIPProcessor"), ("align", "AlignProcessor"), ("altclip", "AltCLIPProcessor"), ("aria", "AriaProcessor"), From cae9b80297a2760ce33c3ca616a6c46056003e8a Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Thu, 24 Apr 2025 05:29:22 +0530 Subject: [PATCH 48/62] nit --- src/transformers/models/aimv2/modeling_aimv2.py | 2 ++ src/transformers/models/aimv2/modular_aimv2.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 454cac875df..62584adec1e 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -537,6 +537,8 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" more detail. interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 3190a84775a..08dab108fe2 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -556,6 +556,8 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" more detail. interpolate_pos_encoding (`bool`, *optional*, defaults `False`): Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ From b7802deefc74d016305c5dbfa53c130759880a81 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 30 Apr 2025 21:10:52 +0530 Subject: [PATCH 49/62] =?UTF-8?q?Use=20more=20Modular=20=F0=9F=98=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/aimv2/configuration_aimv2.py | 20 +-- .../convert_aimv2_original_pytorch_to_hf.py | 31 +++-- .../models/aimv2/modeling_aimv2.py | 105 ++++++--------- .../models/aimv2/modular_aimv2.py | 127 +++--------------- 4 files changed, 80 insertions(+), 203 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index 5810880823e..eec2382cdfd 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -55,11 +55,9 @@ class AIMv2VisionConfig(PretrainedConfig): The epsilon used by the rms normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - projection_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for projection layer in Attention Module. qkv_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the queries, keys and values. - use_bias (`bool`, *optional*, defaults to `False`): + mlp_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the Linear layers or Not. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, @@ -99,9 +97,8 @@ class AIMv2VisionConfig(PretrainedConfig): patch_size: int = 14, rms_norm_eps: float = 1e-5, attention_dropout: float = 0.0, - projection_dropout: float = 0.0, qkv_bias: bool = False, - use_bias: bool = False, + mlp_bias: bool = False, hidden_act: str = "silu", initializer_range: float = 0.02, use_head: bool = True, @@ -122,10 +119,9 @@ class AIMv2VisionConfig(PretrainedConfig): self.use_head = use_head self.initializer_range = initializer_range - self.use_bias = use_bias + self.mlp_bias = mlp_bias self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps - self.projection_dropout = projection_dropout self.is_native = is_native @@ -155,11 +151,9 @@ class AIMv2TextConfig(PretrainedConfig): The epsilon used by the rms normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - projection_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for projection layer in Attention Module. qkv_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the queries, keys and values. - use_bias (`bool`, *optional*, defaults to `False`): + mlp_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the Linear layers or Not. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, @@ -189,9 +183,8 @@ class AIMv2TextConfig(PretrainedConfig): num_attention_heads: int = 6, rms_norm_eps: float = 1e-5, attention_dropout: float = 0.0, - projection_dropout: float = 0.0, qkv_bias: bool = False, - use_bias: bool = False, + mlp_bias: bool = False, hidden_act: str = "silu", pad_token_id: int = None, bos_token_id: int = None, @@ -212,10 +205,9 @@ class AIMv2TextConfig(PretrainedConfig): self.attention_dropout = attention_dropout self.initializer_range = initializer_range - self.use_bias = use_bias + self.mlp_bias = mlp_bias self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps - self.projection_dropout = projection_dropout class AIMv2Config(PretrainedConfig): diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index d53796bc200..7074b3a2bbe 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -23,7 +23,7 @@ import torch from huggingface_hub import snapshot_download from safetensors import safe_open -from transformers import AIMv2Config, AIMv2Model, AIMv2VisionConfig, AIMv2VisionModel, AutoProcessor +from transformers import AIMv2Config, AIMv2Model, AIMv2VisionConfig, AIMv2VisionModel, AutoProcessor, AutoImageProcessor ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL = { @@ -33,10 +33,10 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL = { r"preprocessor.patchifier.norm.weight": r"embeddings.rms_norm.weight", # Encoder Layers r"trunk.blocks.(\d+).attn.qkv": r"encoder.layers.\1.attention.qkv", - r"trunk.blocks.(\d+).attn.proj": r"encoder.layers.\1.attention.proj_out", - r"trunk.blocks.(\d+).mlp.fc1": r"encoder.layers.\1.ffn.fc1", - r"trunk.blocks.(\d+).mlp.fc2": r"encoder.layers.\1.ffn.fc2", - r"trunk.blocks.(\d+).mlp.fc3": r"encoder.layers.\1.ffn.fc3", + r"trunk.blocks.(\d+).attn.proj": r"encoder.layers.\1.attention.out_proj", + r"trunk.blocks.(\d+).mlp.fc1": r"encoder.layers.\1.ffn.gate_proj", + r"trunk.blocks.(\d+).mlp.fc2": r"encoder.layers.\1.ffn.down_proj", + r"trunk.blocks.(\d+).mlp.fc3": r"encoder.layers.\1.ffn.up_proj", # Normalization Layers r"trunk.blocks.(\d+).norm_1": r"encoder.layers.\1.rms_norm1", r"trunk.blocks.(\d+).norm_2": r"encoder.layers.\1.rms_norm2", @@ -51,10 +51,10 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { r"image_encoder.preprocessor.patchifier.norm.weight": r"vision_model.embeddings.rms_norm.weight", # Vision Encoder Layers r"image_encoder.trunk.blocks.(\d+).attn.qkv": r"vision_model.encoder.layers.\1.attention.qkv", - r"image_encoder.trunk.blocks.(\d+).attn.proj": r"vision_model.encoder.layers.\1.attention.proj_out", - r"image_encoder.trunk.blocks.(\d+).mlp.fc1": r"vision_model.encoder.layers.\1.ffn.fc1", - r"image_encoder.trunk.blocks.(\d+).mlp.fc2": r"vision_model.encoder.layers.\1.ffn.fc2", - r"image_encoder.trunk.blocks.(\d+).mlp.fc3": r"vision_model.encoder.layers.\1.ffn.fc3", + r"image_encoder.trunk.blocks.(\d+).attn.proj": r"vision_model.encoder.layers.\1.attention.out_proj", + r"image_encoder.trunk.blocks.(\d+).mlp.fc1": r"vision_model.encoder.layers.\1.ffn.gate_proj", + r"image_encoder.trunk.blocks.(\d+).mlp.fc2": r"vision_model.encoder.layers.\1.ffn.down_proj", + r"image_encoder.trunk.blocks.(\d+).mlp.fc3": r"vision_model.encoder.layers.\1.ffn.up_proj", # 
Normalization Layers r"image_encoder.trunk.blocks.(\d+).norm_1": r"vision_model.encoder.layers.\1.rms_norm1", r"image_encoder.trunk.blocks.(\d+).norm_2": r"vision_model.encoder.layers.\1.rms_norm2", @@ -70,10 +70,10 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { r"text_encoder.preprocessor.positional_embedding": r"text_model.embeddings.position_embedding.weight", # Text Encoder Layers r"text_encoder.trunk.blocks.(\d+).attn.qkv": r"text_model.encoder.layers.\1.attention.qkv", - r"text_encoder.trunk.blocks.(\d+).attn.proj": r"text_model.encoder.layers.\1.attention.proj_out", - r"text_encoder.trunk.blocks.(\d+).mlp.fc1": r"text_model.encoder.layers.\1.ffn.fc1", - r"text_encoder.trunk.blocks.(\d+).mlp.fc2": r"text_model.encoder.layers.\1.ffn.fc2", - r"text_encoder.trunk.blocks.(\d+).mlp.fc3": r"text_model.encoder.layers.\1.ffn.fc3", + r"text_encoder.trunk.blocks.(\d+).attn.proj": r"text_model.encoder.layers.\1.attention.out_proj", + r"text_encoder.trunk.blocks.(\d+).mlp.fc1": r"text_model.encoder.layers.\1.ffn.gate_proj", + r"text_encoder.trunk.blocks.(\d+).mlp.fc2": r"text_model.encoder.layers.\1.ffn.down_proj", + r"text_encoder.trunk.blocks.(\d+).mlp.fc3": r"text_model.encoder.layers.\1.ffn.up_proj", # Text Normalization Layers r"text_encoder.trunk.blocks.(\d+).norm_1": r"text_model.encoder.layers.\1.rms_norm1", r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2", @@ -206,7 +206,10 @@ def write_model( def write_image_processor(hf_repo_id: str, output_dir: str): - image_processor = AutoProcessor.from_pretrained(hf_repo_id, use_fast=True) + if hf_repo_id == "apple/aimv2-large-patch14-224-lit": + image_processor = AutoProcessor.from_pretrained(hf_repo_id, use_fast=True) + else: + image_processor = AutoImageProcessor.from_pretrained(hf_repo_id, use_fast=True) image_processor.save_pretrained(output_dir) return image_processor diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 62584adec1e..331eea0d78b 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -30,12 +30,13 @@ from torch import nn from transformers.modeling_attn_mask_utils import AttentionMaskConverter from transformers.modeling_outputs import BaseModelOutputWithPooling -from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from transformers.modeling_utils import PreTrainedModel from ...activations import ACT2FN from ...integrations import use_kernel_forward_from_hub from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...utils import ( ModelOutput, add_start_docstrings, @@ -108,23 +109,20 @@ class AIMv2RMSNorm(nn.Module): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: AIMv2VisionConfig): +class AIMv2MLP(nn.Module): + def __init__(self, config): super().__init__() - in_features = config.hidden_size - out_features = config.intermediate_size - self.act_fn = config.hidden_act + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + 
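The tables above rename checkpoint keys with regex patterns (note `attn.proj` → `out_proj` and `fc1/fc2/fc3` → `gate_proj/down_proj/up_proj`). A small sketch of how such a table can be applied to state-dict keys; the `convert_key` helper below is illustrative, not necessarily the script's exact function:

```python
import re

# A few entries copied from ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL above.
MAPPING = {
    r"trunk.blocks.(\d+).attn.qkv": r"encoder.layers.\1.attention.qkv",
    r"trunk.blocks.(\d+).attn.proj": r"encoder.layers.\1.attention.out_proj",
    r"trunk.blocks.(\d+).mlp.fc1": r"encoder.layers.\1.ffn.gate_proj",
    r"trunk.blocks.(\d+).mlp.fc2": r"encoder.layers.\1.ffn.down_proj",
    r"trunk.blocks.(\d+).mlp.fc3": r"encoder.layers.\1.ffn.up_proj",
}

def convert_key(old_key: str) -> str:
    """Rewrite the first matching pattern to the new naming scheme."""
    for pattern, replacement in MAPPING.items():
        new_key, n = re.subn(pattern, replacement, old_key)
        if n:
            return new_key
    return old_key  # keys without a mapping are kept as-is

print(convert_key("trunk.blocks.3.mlp.fc1.weight"))   # encoder.layers.3.ffn.gate_proj.weight
print(convert_key("trunk.blocks.10.attn.proj.bias"))  # encoder.layers.10.attention.out_proj.bias
```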
self.act_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(in_features, out_features, bias=config.use_bias) - self.fc2 = nn.Linear(out_features, in_features, bias=config.use_bias) - self.fc3 = nn.Linear(in_features, out_features, bias=config.use_bias) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - fc3_out = self.fc3(hidden_states) - fc1_out = self.fc1(hidden_states) - hidden_states = ACT2FN[self.act_fn](fc1_out) * fc3_out - hidden_states = self.fc2(hidden_states) - return hidden_states + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj class AIMv2VisionEmbeddings(nn.Module): @@ -219,18 +217,6 @@ class AIMv2TextEmbeddings(nn.Module): return embeddings -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - def eager_attention_forward( module: nn.Module, query: torch.Tensor, @@ -241,17 +227,14 @@ def eager_attention_forward( dropout: float = 0.0, **kwargs, ): - key_states = repeat_kv(key, module.num_key_value_groups) - value_states = repeat_kv(value, module.num_key_value_groups) - - attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask + attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - attn_output = torch.matmul(attn_weights, value_states) + + attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() return attn_output, attn_weights @@ -265,43 +248,37 @@ class AIMv2Attention(nn.Module): self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads - self.attention_dropout = config.attention_dropout self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads})." 
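The new `AIMv2MLP` is the Llama-style gated MLP, and with the fc1→gate_proj, fc2→down_proj, fc3→up_proj remapping from the conversion script it computes exactly what the removed `AIMv2SwiGLUFFN` computed, i.e. `fc2(silu(fc1(x)) * fc3(x))`. A standalone numerical check of that equivalence, using plain `nn.Linear` layers rather than the PR's classes:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
hidden, inter = 16, 32

# Old layout: fc1 (gate), fc3 (up), fc2 (down), SiLU gating.
fc1 = nn.Linear(hidden, inter, bias=False)
fc2 = nn.Linear(inter, hidden, bias=False)
fc3 = nn.Linear(hidden, inter, bias=False)

# New Llama-style layout, weights copied the way the converter maps them.
gate_proj = nn.Linear(hidden, inter, bias=False)
up_proj = nn.Linear(hidden, inter, bias=False)
down_proj = nn.Linear(inter, hidden, bias=False)
gate_proj.weight.data.copy_(fc1.weight)   # fc1 -> gate_proj
down_proj.weight.data.copy_(fc2.weight)   # fc2 -> down_proj
up_proj.weight.data.copy_(fc3.weight)     # fc3 -> up_proj

x = torch.randn(2, 5, hidden)
old = fc2(F.silu(fc1(x)) * fc3(x))
new = down_proj(F.silu(gate_proj(x)) * up_proj(x))
print(torch.allclose(old, new))  # True
```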
) - - self.num_key_value_groups = 1 - self.scaling = self.head_dim**-0.5 - + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) - self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) - self.proj_drop = nn.Dropout(config.projection_dropout) - - self.is_causal = False + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" - batch_size, q_len, _ = hidden_states.size() + batch_size, seq_length, embed_dim = hidden_states.shape - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": @@ -315,31 +292,29 @@ class AIMv2Attention(nn.Module): attn_output, attn_weights = attention_interface( self, - query_states, - key_states, - value_states, + queries, + keys, + values, attention_mask, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, is_causal=self.is_causal, - **kwargs, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, ) - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) - attn_output = self.proj_out(attn_output) - attn_output = self.proj_drop(attn_output) + if not output_attentions: + attn_weights = None - output = (attn_output, attn_weights) if output_attentions else (attn_output, None) - - return output + return attn_output, attn_weights class AIMv2EncoderLayer(GradientCheckpointingLayer): def __init__(self, config: AIMv2VisionConfig): super().__init__() self.attention = AIMv2Attention(config) - self.ffn = AIMv2SwiGLUFFN(config) + self.ffn = AIMv2MLP(config) self.rms_norm1 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 08dab108fe2..40de682b3ea 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -16,7 +16,7 @@ """Pytorch implementation of AIMv2 Model""" import math -from typing import Callable, Optional, Tuple +from 
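The rewritten attention drops `repeat_kv` (with `num_key_value_groups` fixed at 1 it was a no-op here) and keeps `eager_attention_forward` only as the fallback behind `ALL_ATTENTION_FUNCTIONS`. As a sanity sketch, that eager math should match PyTorch's fused `scaled_dot_product_attention` for the same head-split tensors (no mask, no dropout assumed):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch, heads, seq, head_dim = 2, 4, 7, 8
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
v = torch.randn(batch, heads, seq, head_dim)
scaling = head_dim ** -0.5

# Eager path, mirroring eager_attention_forward above (no mask, no dropout).
weights = torch.softmax(torch.matmul(q, k.transpose(-1, -2)) * scaling, dim=-1)
eager_out = torch.matmul(weights, v)

# Fused/dispatched path with the same default 1/sqrt(head_dim) scaling.
sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)

print(torch.allclose(eager_out, sdpa_out, atol=1e-6))  # True
```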
typing import Optional, Tuple import torch import torch.nn.functional as F @@ -24,9 +24,8 @@ from torch import nn from transformers.modeling_attn_mask_utils import AttentionMaskConverter from transformers.modeling_outputs import BaseModelOutputWithPooling -from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from transformers.modeling_utils import PreTrainedModel -from ...activations import ACT2FN from ...modeling_layers import GradientCheckpointingLayer from ...utils import ( add_start_docstrings, @@ -36,10 +35,10 @@ from ...utils import ( replace_return_docstrings, ) from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm -from ..llama.modeling_llama import LlamaRMSNorm, eager_attention_forward +from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig -from ..siglip.modeling_siglip import SiglipEncoder, SiglipOutput - +from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput +from ..altclip.modeling_altclip import AltCLIPModel logger = logging.get_logger(__name__) @@ -73,11 +72,9 @@ class AIMv2VisionConfig(SiglipVisionConfig): The epsilon used by the rms normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - projection_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for projection layer in Attention Module. qkv_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the queries, keys and values. - use_bias (`bool`, *optional*, defaults to `False`): + mlp_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the Linear layers or Not. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, @@ -114,9 +111,8 @@ class AIMv2VisionConfig(SiglipVisionConfig): patch_size: int = 14, rms_norm_eps: float = 1e-5, attention_dropout: float = 0.0, - projection_dropout: float = 0.0, qkv_bias: bool = False, - use_bias: bool = False, + mlp_bias: bool = False, hidden_act: str = "silu", initializer_range: float = 0.02, use_head: bool = True, @@ -139,10 +135,9 @@ class AIMv2VisionConfig(SiglipVisionConfig): self.use_head = use_head self.initializer_range = initializer_range self.attention_dropout = attention_dropout - self.use_bias = use_bias + self.mlp_bias = mlp_bias self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps - self.projection_dropout = projection_dropout self.is_native = is_native del self.layer_norm_eps @@ -174,11 +169,9 @@ class AIMv2TextConfig(SiglipTextConfig): The epsilon used by the rms normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - projection_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for projection layer in Attention Module. qkv_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the queries, keys and values. - use_bias (`bool`, *optional*, defaults to `False`): + mlp_bias (`bool`, *optional*, defaults to `False`): Whether to add a bias to the Linear layers or Not. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`, @@ -205,9 +198,8 @@ class AIMv2TextConfig(SiglipTextConfig): num_attention_heads: int = 6, rms_norm_eps: float = 1e-5, attention_dropout: float = 0.0, - projection_dropout: float = 0.0, qkv_bias: bool = False, - use_bias: bool = False, + mlp_bias: bool = False, hidden_act: str = "silu", pad_token_id: int = None, bos_token_id: int = None, @@ -232,10 +224,9 @@ class AIMv2TextConfig(SiglipTextConfig): self.initializer_range = initializer_range self.attention_dropout = attention_dropout - self.use_bias = use_bias + self.mlp_bias = mlp_bias self.qkv_bias = qkv_bias self.rms_norm_eps = rms_norm_eps - self.projection_dropout = projection_dropout del self.bos_token_id del self.pad_token_id @@ -310,23 +301,8 @@ class AIMv2RMSNorm(LlamaRMSNorm): pass -class AIMv2SwiGLUFFN(nn.Module): - def __init__(self, config: AIMv2VisionConfig): - super().__init__() - in_features = config.hidden_size - out_features = config.intermediate_size - self.act_fn = config.hidden_act - - self.fc1 = nn.Linear(in_features, out_features, bias=config.use_bias) - self.fc2 = nn.Linear(out_features, in_features, bias=config.use_bias) - self.fc3 = nn.Linear(in_features, out_features, bias=config.use_bias) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - fc3_out = self.fc3(hidden_states) - fc1_out = self.fc1(hidden_states) - hidden_states = ACT2FN[self.act_fn](fc1_out) * fc3_out - hidden_states = self.fc2(hidden_states) - return hidden_states +class AIMv2MLP(LlamaMLP): + pass class AIMv2VisionEmbeddings(nn.Module): @@ -385,89 +361,20 @@ class AIMv2TextEmbeddings(CLIPTextEmbeddings): pass -class AIMv2Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - +class AIMv2Attention(SiglipAttention): def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.attention_dropout = config.attention_dropout - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - - self.num_key_value_groups = 1 - self.scaling = self.head_dim**-0.5 - + super().__init__(config) self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) - self.proj_out = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) - self.proj_drop = nn.Dropout(config.projection_dropout) - - self.is_causal = False - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - attention_mask, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, - is_causal=self.is_causal, - **kwargs, - ) - - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.proj_out(attn_output) - attn_output = self.proj_drop(attn_output) - - output = (attn_output, attn_weights) if output_attentions else (attn_output, None) - - return output + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) class AIMv2EncoderLayer(GradientCheckpointingLayer): def __init__(self, config: AIMv2VisionConfig): super().__init__() self.attention = AIMv2Attention(config) - self.ffn = AIMv2SwiGLUFFN(config) + self.ffn = AIMv2MLP(config) self.rms_norm1 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) From cfd018685f861a39020e19baf97f49ca32c662e8 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 30 Apr 2025 21:29:41 +0530 Subject: [PATCH 50/62] Change name from AIMv2 to Aimv2 --- docs/source/en/model_doc/aimv2.md | 24 +-- .../models/aimv2/configuration_aimv2.py | 64 ++++---- .../convert_aimv2_original_pytorch_to_hf.py | 13 +- .../models/aimv2/modeling_aimv2.py | 132 ++++++++-------- .../models/aimv2/modular_aimv2.py | 148 +++++++++--------- .../models/auto/configuration_auto.py | 6 +- src/transformers/models/auto/modeling_auto.py | 6 +- tests/models/aimv2/test_modeling_aimv2.py | 123 ++++++++++++--- utils/check_repo.py | 2 +- 9 files changed, 299 insertions(+), 219 deletions(-) diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md index 
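In the modular file the attention is now just a `SiglipAttention` subclass that re-creates the four projections with `config.qkv_bias`, while the MLP and RMSNorm become aliases of the Llama classes; the modular converter expands these subclasses into the standalone `modeling_aimv2.py` shown earlier. A generic sketch of that override pattern (the base class below is a stand-in for `SiglipAttention`, not the real one):

```python
import torch.nn as nn

class BaseAttention(nn.Module):
    """Stand-in for a parent such as SiglipAttention: projections with bias=True."""
    def __init__(self, config):
        super().__init__()
        self.embed_dim = config["hidden_size"]
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

class ChildAttention(BaseAttention):
    """Override only what differs: re-create the projections with the configured bias."""
    def __init__(self, config):
        super().__init__(config)
        bias = config["qkv_bias"]
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias)

attn = ChildAttention({"hidden_size": 32, "qkv_bias": False})
print(attn.q_proj.bias)  # None: the child's bias-free projections replaced the biased ones
```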
7db1c291bc4..1c05c5068d7 100644 --- a/docs/source/en/model_doc/aimv2.md +++ b/docs/source/en/model_doc/aimv2.md @@ -73,31 +73,31 @@ outputs = model(**inputs) probs = outputs.logits_per_image.softmax(dim=-1) ``` -## AIMv2Config +## Aimv2Config -[[autodoc]] AIMv2Config +[[autodoc]] Aimv2Config -## AIMv2TextConfig +## Aimv2TextConfig -[[autodoc]] AIMv2TextConfig +[[autodoc]] Aimv2TextConfig -## AIMv2VisionConfig +## Aimv2VisionConfig -[[autodoc]] AIMv2VisionConfig +[[autodoc]] Aimv2VisionConfig -## AIMv2Model +## Aimv2Model -[[autodoc]] AIMv2Model +[[autodoc]] Aimv2Model - forward -## AIMv2VisionModel +## Aimv2VisionModel -[[autodoc]] AIMv2VisionModel +[[autodoc]] Aimv2VisionModel - forward -## AIMv2TextModel +## Aimv2TextModel -[[autodoc]] AIMv2TextModel +[[autodoc]] Aimv2TextModel - forward diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index eec2382cdfd..ec7a643da6e 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -26,9 +26,9 @@ from ...utils import logging logger = logging.get_logger(__name__) -class AIMv2VisionConfig(PretrainedConfig): +class Aimv2VisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. @@ -73,11 +73,11 @@ class AIMv2VisionConfig(PretrainedConfig): ```python >>> from transformers import SiglipVisionConfig, SiglipVisionModel - >>> # Initializing a AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration - >>> configuration = AIMv2VisionConfig() + >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = Aimv2VisionConfig() - >>> # Initializing a AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration - >>> model = AIMv2VisionModel(configuration) + >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = Aimv2VisionModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -125,9 +125,9 @@ class AIMv2VisionConfig(PretrainedConfig): self.is_native = is_native -class AIMv2TextConfig(PretrainedConfig): +class Aimv2TextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. @@ -138,7 +138,7 @@ class AIMv2TextConfig(PretrainedConfig): Args: vocab_size (`int`, *optional*, defaults to 49408): Vocabulary size of the AIMv2 text model. 
Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`AIMv2Model`]. + the `inputs_ids` passed when calling [`Aimv2Model`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. intermediate_size (`int`, *optional*, defaults to 2048): @@ -210,9 +210,9 @@ class AIMv2TextConfig(PretrainedConfig): self.rms_norm_eps = rms_norm_eps -class AIMv2Config(PretrainedConfig): +class Aimv2Config(PretrainedConfig): r""" - [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to + [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2 [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. @@ -222,9 +222,9 @@ class AIMv2Config(PretrainedConfig): Args: text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`AIMv2TextConfig`]. + Dictionary of configuration options used to initialize [`Aimv2TextConfig`]. vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`AIMv2VisionConfig`]. + Dictionary of configuration options used to initialize [`Aimv2VisionConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimensionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): @@ -235,29 +235,29 @@ class AIMv2Config(PretrainedConfig): Example: ```python - >>> from transformers import AIMv2Config, AIMv2Model + >>> from transformers import Aimv2Config, Aimv2Model - >>> # Initializing a AIMv2Config with apple/aimv2-large-patch14-224-lit style configuration - >>> configuration = AIMv2Config() + >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration + >>> configuration = Aimv2Config() - >>> # Initializing a AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration - >>> model = AIMv2Model(configuration) + >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration + >>> model = Aimv2Model(configuration) >>> # Accessing the model configuration >>> configuration = model.config - >>> # We can also initialize a AIMv2Config from a AIMv2TextConfig and a AIMv2VisionConfig - >>> from transformers import AIMv2TextConfig, AIMv2VisionConfig + >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig + >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig >>> # Initializing a AIMv2Text and AIMv2Vision configuration - >>> config_text = AIMv2TextConfig() - >>> config_vision = AIMv2VisionConfig() + >>> config_text = Aimv2TextConfig() + >>> config_vision = Aimv2VisionConfig() - >>> config = AIMv2Config(text_config=config_text, vision_config=config_vision) + >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision) ```""" model_type = "aimv2" - sub_configs = {"text_config": AIMv2TextConfig, "vision_config": AIMv2VisionConfig} + sub_configs = {"text_config": Aimv2TextConfig, "vision_config": Aimv2VisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs @@ -266,29 +266,29 @@ 
class AIMv2Config(PretrainedConfig): if text_config is None: text_config = {} - logger.info("`text_config` is `None`. Initializing the `AIMv2TextConfig` with default values.") + logger.info("`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.") if vision_config is None: vision_config = {} - logger.info("`vision_config` is `None`. initializing the `AIMv2VisionConfig` with default values.") + logger.info("`vision_config` is `None`. initializing the `Aimv2VisionConfig` with default values.") - self.text_config = AIMv2TextConfig(**text_config) - self.vision_config = AIMv2VisionConfig(**vision_config) + self.text_config = Aimv2TextConfig(**text_config) + self.vision_config = Aimv2VisionConfig(**vision_config) self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.max_logit_scale = 100.0 @classmethod - def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs): + def from_text_vision_configs(cls, text_config: Aimv2TextConfig, vision_config: Aimv2VisionConfig, **kwargs): r""" - Instantiate a [`AIMv2Config`] (or a derived class) from aimv2 text model configuration and aimv2 vision + Instantiate a [`Aimv2Config`] (or a derived class) from aimv2 text model configuration and aimv2 vision model configuration. Returns: - [`AIMv2Config`]: An instance of a configuration object + [`Aimv2Config`]: An instance of a configuration object """ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) -__all__ = ["AIMv2Config", "AIMv2VisionConfig", "AIMv2TextConfig"] +__all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"] diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 7074b3a2bbe..17c419074d0 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -23,7 +23,14 @@ import torch from huggingface_hub import snapshot_download from safetensors import safe_open -from transformers import AIMv2Config, AIMv2Model, AIMv2VisionConfig, AIMv2VisionModel, AutoProcessor, AutoImageProcessor +from transformers import ( + Aimv2Config, + Aimv2Model, + Aimv2VisionConfig, + Aimv2VisionModel, + AutoImageProcessor, + AutoProcessor, +) ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL = { @@ -131,9 +138,9 @@ def get_model_config_mapping(model_id: str): """Determines the correct model, config, and key mappings based on the checkpoint name.""" if model_id == "apple/aimv2-large-patch14-224-lit": - return AIMv2Model, AIMv2Config, ORIGINAL_TO_CONVERTED_KEY_MAPPING + return Aimv2Model, Aimv2Config, ORIGINAL_TO_CONVERTED_KEY_MAPPING else: - return AIMv2VisionModel, AIMv2VisionConfig, ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL + return Aimv2VisionModel, Aimv2VisionConfig, ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION_MODEL def write_model( diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 331eea0d78b..8f6f1199eaa 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -45,14 +45,14 @@ from ...utils import ( logging, replace_return_docstrings, ) -from .configuration_aimv2 import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig +from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig logger = logging.get_logger(__name__) 
@dataclass -class AIMv2Output(ModelOutput): +class Aimv2Output(ModelOutput): """ Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): @@ -64,13 +64,13 @@ class AIMv2Output(ModelOutput): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`AIMv2TextModel`]. + The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`]. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`AIMv2VisionModel`]. + The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`]. text_model_output (`BaseModelOutputWithPooling`): - The output of the [`AIMv2TextModel`]. + The output of the [`Aimv2TextModel`]. vision_model_output (`BaseModelOutputWithPooling`): - The output of the [`AIMv2VisionModel`]. + The output of the [`Aimv2VisionModel`]. """ loss: Optional[torch.FloatTensor] = None @@ -89,10 +89,10 @@ class AIMv2Output(ModelOutput): @use_kernel_forward_from_hub("RMSNorm") -class AIMv2RMSNorm(nn.Module): +class Aimv2RMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ - AIMv2RMSNorm is equivalent to T5LayerNorm + Aimv2RMSNorm is equivalent to T5LayerNorm """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) @@ -109,7 +109,7 @@ class AIMv2RMSNorm(nn.Module): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -class AIMv2MLP(nn.Module): +class Aimv2MLP(nn.Module): def __init__(self, config): super().__init__() self.config = config @@ -125,15 +125,15 @@ class AIMv2MLP(nn.Module): return down_proj -class AIMv2VisionEmbeddings(nn.Module): - def __init__(self, config: AIMv2VisionConfig): +class Aimv2VisionEmbeddings(nn.Module): + def __init__(self, config: Aimv2VisionConfig): super().__init__() self.config = config self.patch_size = config.patch_size self.patch_embed = nn.Conv2d( config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size ) - self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) num_patches = (config.image_size // config.patch_size) ** 2 if not self.config.is_native: @@ -177,8 +177,8 @@ class AIMv2VisionEmbeddings(nn.Module): return hidden_states -class AIMv2TextEmbeddings(nn.Module): - def __init__(self, config: AIMv2TextConfig): +class Aimv2TextEmbeddings(nn.Module): + def __init__(self, config: Aimv2TextConfig): super().__init__() embed_dim = config.hidden_size @@ -240,7 +240,7 @@ def eager_attention_forward( return attn_output, attn_weights -class AIMv2Attention(nn.Module): +class Aimv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config): @@ -310,13 +310,13 @@ class AIMv2Attention(nn.Module): return attn_output, attn_weights -class AIMv2EncoderLayer(GradientCheckpointingLayer): - def __init__(self, config: AIMv2VisionConfig): +class Aimv2EncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Aimv2VisionConfig): super().__init__() - self.attention = AIMv2Attention(config) - self.ffn = AIMv2MLP(config) - self.rms_norm1 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) - 
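`Aimv2VisionEmbeddings` above patchifies the image with a stride-`patch_size` Conv2d and applies `Aimv2RMSNorm` to the resulting tokens, with `num_patches = (image_size // patch_size) ** 2` when fixed positional embeddings are used. A shape-only sketch of the patchify step; the sizes are arbitrary, only the Conv2d layout is taken from the code above:

```python
import torch
import torch.nn as nn

hidden_size, patch_size, num_channels = 64, 14, 3
image_size = 56  # arbitrary multiple of patch_size for the sketch

patch_embed = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(2, num_channels, image_size, image_size)
# (batch, hidden, H/patch, W/patch) -> flatten the grid -> (batch, num_patches, hidden)
hidden_states = patch_embed(pixel_values).flatten(2).transpose(1, 2)

num_patches = (image_size // patch_size) ** 2
print(hidden_states.shape, num_patches)  # torch.Size([2, 16, 64]) 16
```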
self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.attention = Aimv2Attention(config) + self.ffn = Aimv2MLP(config) + self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( self, @@ -337,19 +337,19 @@ class AIMv2EncoderLayer(GradientCheckpointingLayer): return (hidden_states, attn_weights) if output_attentions else (hidden_states, None) -class AIMv2Encoder(nn.Module): +class Aimv2Encoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`AIMv2EncoderLayer`]. + [`Aimv2EncoderLayer`]. Args: - config: AIMv2Config + config: Aimv2Config """ - def __init__(self, config: AIMv2Config): + def __init__(self, config: Aimv2Config): super().__init__() self.config = config - self.layers = nn.ModuleList([AIMv2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.layers = nn.ModuleList([Aimv2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False # Ignore copy @@ -417,8 +417,8 @@ class AIMv2Encoder(nn.Module): ) -class AIMv2AttentionPoolingHead(nn.Module): - def __init__(self, config: AIMv2VisionConfig): +class Aimv2AttentionPoolingHead(nn.Module): + def __init__(self, config: Aimv2VisionConfig): super().__init__() self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads @@ -451,13 +451,13 @@ class AIMv2AttentionPoolingHead(nn.Module): return output -class AIMv2PreTrainedModel(PreTrainedModel): +class Aimv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. The model is only intended for inference and doesn't support finetuning. """ - config_class = AIMv2Config + config_class = Aimv2Config base_model_prefix = "aimv2" supports_gradient_checkpointing = True _no_split_modules = ["AIMv2SwiGLUFFN"] @@ -473,14 +473,14 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, AIMv2RMSNorm): + elif isinstance(module, Aimv2RMSNorm): module.weight.data.fill_(1.0) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) elif hasattr(module, "logit_scale"): if isinstance(module.logit_scale, nn.Parameter): module.logit_scale.data.fill_(math.log(1 / 0.07)) - elif isinstance(module, AIMv2AttentionPoolingHead): + elif isinstance(module, Aimv2AttentionPoolingHead): module.cls_token.data.normal_(mean=0.0, std=std) @@ -494,7 +494,7 @@ AIMV2_VISION_START_DOCSTRING = r""" and behavior. Parameters: - config ([`AIMv2VisionConfig`]): Model configuration class with all the parameters of the model. + config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
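`Aimv2AttentionPoolingHead` reduces the patch tokens to a single image embedding by letting a learned `cls_token` (initialized in `_init_weights` above) attend over them. Only the constructor is visible in this hunk, so the following is a generic sketch of cls-token attention pooling, not the PR's exact module:

```python
import torch
import torch.nn as nn

class AttentionPoolingSketch(nn.Module):
    """Generic cls-token attention pooling; illustrative only."""
    def __init__(self, hidden_size: int, num_heads: int):
        super().__init__()
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size = hidden_states.shape[0]
        query = self.cls_token.expand(batch_size, -1, -1)   # (B, 1, H): one query per image
        pooled, _ = self.attn(query, hidden_states, hidden_states)
        return pooled[:, 0]                                  # (B, H)

pool = AttentionPoolingSketch(hidden_size=64, num_heads=4)
print(pool(torch.randn(2, 16, 64)).shape)  # torch.Size([2, 64])
```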
""" @@ -521,20 +521,20 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" """The vision model from AIMv2 without any head or projection on top.""", AIMV2_VISION_START_DOCSTRING, ) -class AIMv2VisionModel(AIMv2PreTrainedModel): +class Aimv2VisionModel(Aimv2PreTrainedModel): main_input_name = "pixel_values" - def __init__(self, config: AIMv2VisionConfig): + def __init__(self, config: Aimv2VisionConfig): super().__init__(config) self.config = config - self.embeddings = AIMv2VisionEmbeddings(config) - self.encoder = AIMv2Encoder(config) + self.embeddings = Aimv2VisionEmbeddings(config) + self.encoder = Aimv2Encoder(config) # The only change from SiglipVisionTransformer is, layernorm -> rms_norm. - self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.use_head = config.use_head if self.use_head: - self.head = AIMv2AttentionPoolingHead(config) + self.head = Aimv2AttentionPoolingHead(config) self.post_init() @@ -543,7 +543,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): @can_return_tuple @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AIMv2VisionConfig) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, pixel_values, @@ -561,7 +561,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): >>> import requests >>> from transformers import AutoProcessor, Siglip2VisionModel - >>> model = AIMv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") + >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -599,15 +599,15 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): ) -class AIMv2TextModel(AIMv2PreTrainedModel): +class Aimv2TextModel(Aimv2PreTrainedModel): main_input_name = "input_ids" - def __init__(self, config: AIMv2TextConfig): + def __init__(self, config: Aimv2TextConfig): super().__init__(config) self.config = config - self.embeddings = AIMv2TextEmbeddings(config) - self.encoder = AIMv2Encoder(config) - self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.embeddings = Aimv2TextEmbeddings(config) + self.encoder = Aimv2Encoder(config) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.eos_token_id = config.eos_token_id @@ -686,7 +686,7 @@ AIMV2_START_DOCSTRING = r""" and behavior. Parameters: - config ([`AIMv2Config`]): Model configuration class with all the parameters of the model. + config ([`Aimv2Config`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -747,7 +747,7 @@ AIMV2_INPUTS_DOCSTRING = r""" [What are position IDs?](../glossary#position-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`AIMv2ImageProcessor.__call__`] for details. + [`AutoImageProcessor`]. See [`Aimv2ImageProcessor.__call__`] for details. return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
output_attentions (`bool`, *optional*): @@ -764,19 +764,19 @@ AIMV2_INPUTS_DOCSTRING = r""" @add_start_docstrings(AIMV2_START_DOCSTRING) -class AIMv2Model(AIMv2PreTrainedModel): - config_class = AIMv2Config - _no_split_modules = ["AIMv2TextEmbeddings", "AIMv2EncoderLayer", "AIMv2VisionEmbeddings"] +class Aimv2Model(Aimv2PreTrainedModel): + config_class = Aimv2Config + _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"] - def __init__(self, config: AIMv2Config): + def __init__(self, config: Aimv2Config): super().__init__(config) self.projection_dim = config.projection_dim self.vision_embed_dim = config.vision_config.hidden_size self.text_embed_dim = config.text_config.hidden_size - self.vision_model = AIMv2VisionModel._from_config(config.vision_config) - self.text_model = AIMv2TextModel._from_config(config.text_config) + self.vision_model = Aimv2VisionModel._from_config(config.vision_config) + self.text_model = Aimv2TextModel._from_config(config.text_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) @@ -798,14 +798,14 @@ class AIMv2Model(AIMv2PreTrainedModel): r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by - applying the projection layer to the pooled output of [`AIMv2TextModel`]. + applying the projection layer to the pooled output of [`Aimv2TextModel`]. Examples: ```python - >>> from transformers import AutoTokenizer, AIMv2Model + >>> from transformers import AutoTokenizer, Aimv2Model - >>> model = AIMv2Model.from_pretrained("openai/aimv2-vit-base-patch32") + >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32") >>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -841,16 +841,16 @@ class AIMv2Model(AIMv2PreTrainedModel): r""" Returns: image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by - applying the projection layer to the pooled output of [`AIMv2VisionModel`]. + applying the projection layer to the pooled output of [`Aimv2VisionModel`]. 
Examples: ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, AIMv2Model + >>> from transformers import AutoProcessor, Aimv2Model - >>> model = AIMv2Model.from_pretrained("openai/aimv2-vit-base-patch32") + >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -880,7 +880,7 @@ class AIMv2Model(AIMv2PreTrainedModel): @can_return_tuple @add_start_docstrings_to_model_forward(AIMV2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=AIMv2Output, config_class=AIMv2Config) + @replace_return_docstrings(output_type=Aimv2Output, config_class=Aimv2Config) def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -889,7 +889,7 @@ class AIMv2Model(AIMv2PreTrainedModel): return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - ) -> AIMv2Output: + ) -> Aimv2Output: r""" Returns: @@ -898,9 +898,9 @@ class AIMv2Model(AIMv2PreTrainedModel): ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, AIMv2Model + >>> from transformers import AutoProcessor, Aimv2Model - >>> model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") + >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -947,7 +947,7 @@ class AIMv2Model(AIMv2PreTrainedModel): logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() logits_per_image = logits_per_text.t() - return AIMv2Output( + return Aimv2Output( logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, @@ -957,4 +957,4 @@ class AIMv2Model(AIMv2PreTrainedModel): ) -__all__ = ["AIMv2VisionModel", "AIMv2Model", "AIMv2PreTrainedModel", "AIMv2TextModel"] +__all__ = ["Aimv2VisionModel", "Aimv2Model", "Aimv2PreTrainedModel", "Aimv2TextModel"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 40de682b3ea..52f5f67d9e8 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -38,14 +38,14 @@ from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput -from ..altclip.modeling_altclip import AltCLIPModel + logger = logging.get_logger(__name__) -class AIMv2VisionConfig(SiglipVisionConfig): +class Aimv2VisionConfig(SiglipVisionConfig): r""" - This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. 
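Earlier in this hunk, `Aimv2Model.forward` ends by dividing both embeddings by `_get_vector_norm` and scaling their dot products with the learned temperature before building `Aimv2Output`. A standalone sketch of that similarity computation with random stand-in embeddings; the temperature handling is simplified to the config's init value here:

```python
import torch

torch.manual_seed(0)
image_embeds = torch.randn(4, 512)        # (num_images, projection_dim)
text_embeds = torch.randn(3, 512)         # (num_texts, projection_dim)
logit_scale = torch.tensor(2.6592).exp()  # exp of logit_scale_init_value = log(1/0.07) ~ 14.29

# L2-normalize (the modeling code divides by _get_vector_norm).
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()  # (num_texts, num_images)
logits_per_image = logits_per_text.t()                            # (num_images, num_texts)

probs = logits_per_image.softmax(dim=-1)  # per image, a distribution over the text prompts
print(logits_per_text.shape, logits_per_image.shape, probs.sum(dim=-1))
```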
@@ -90,11 +90,11 @@ class AIMv2VisionConfig(SiglipVisionConfig): ```python >>> from transformers import SiglipVisionConfig, SiglipVisionModel - >>> # Initializing a AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration - >>> configuration = AIMv2VisionConfig() + >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = Aimv2VisionConfig() - >>> # Initializing a AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration - >>> model = AIMv2VisionModel(configuration) + >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = Aimv2VisionModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -143,9 +143,9 @@ class AIMv2VisionConfig(SiglipVisionConfig): del self.layer_norm_eps -class AIMv2TextConfig(SiglipTextConfig): +class Aimv2TextConfig(SiglipTextConfig): r""" - This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a + This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. @@ -156,7 +156,7 @@ class AIMv2TextConfig(SiglipTextConfig): Args: vocab_size (`int`, *optional*, defaults to 49408): Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`AIMv2Model`]. + the `inputs_ids` passed when calling [`Aimv2Model`]. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. intermediate_size (`int`, *optional*, defaults to 2048): @@ -234,9 +234,9 @@ class AIMv2TextConfig(SiglipTextConfig): del self.layer_norm_eps -class AIMv2Config(SiglipConfig): +class Aimv2Config(SiglipConfig): r""" - [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to + [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2 [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. @@ -246,9 +246,9 @@ class AIMv2Config(SiglipConfig): Args: text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`AIMv2TextConfig`]. + Dictionary of configuration options used to initialize [`Aimv2TextConfig`]. vision_config (`dict`, *optional*): - Dictionary of configuration options used to initialize [`AIMv2VisionConfig`]. + Dictionary of configuration options used to initialize [`Aimv2VisionConfig`]. projection_dim (`int`, *optional*, defaults to 512): Dimensionality of text and vision projection layers. 
logit_scale_init_value (`float`, *optional*, defaults to 2.6592): @@ -259,25 +259,25 @@ class AIMv2Config(SiglipConfig): Example: ```python - >>> from transformers import AIMv2Config, AIMv2Model + >>> from transformers import Aimv2Config, Aimv2Model - >>> # Initializing a AIMv2Config with apple/aimv2-large-patch14-224-lit style configuration - >>> configuration = AIMv2Config() + >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration + >>> configuration = Aimv2Config() - >>> # Initializing a AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration - >>> model = AIMv2Model(configuration) + >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration + >>> model = Aimv2Model(configuration) >>> # Accessing the model configuration >>> configuration = model.config - >>> # We can also initialize a AIMv2Config from a AIMv2TextConfig and a AIMv2VisionConfig - >>> from transformers import AIMv2TextConfig, AIMv2VisionConfig + >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig + >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig >>> # Initializing a AIMv2Text and AIMv2Vision configuration - >>> config_text = AIMv2TextConfig() - >>> config_vision = AIMv2VisionConfig() + >>> config_text = Aimv2TextConfig() + >>> config_vision = Aimv2VisionConfig() - >>> config = AIMv2Config(text_config=config_text, vision_config=config_vision) + >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision) ```""" def __init__( @@ -293,27 +293,27 @@ class AIMv2Config(SiglipConfig): pass -class AIMv2Output(SiglipOutput): +class Aimv2Output(SiglipOutput): pass -class AIMv2RMSNorm(LlamaRMSNorm): +class Aimv2RMSNorm(LlamaRMSNorm): pass -class AIMv2MLP(LlamaMLP): +class Aimv2MLP(LlamaMLP): pass -class AIMv2VisionEmbeddings(nn.Module): - def __init__(self, config: AIMv2VisionConfig): +class Aimv2VisionEmbeddings(nn.Module): + def __init__(self, config: Aimv2VisionConfig): super().__init__() self.config = config self.patch_size = config.patch_size self.patch_embed = nn.Conv2d( config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size ) - self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) num_patches = (config.image_size // config.patch_size) ** 2 if not self.config.is_native: @@ -357,11 +357,11 @@ class AIMv2VisionEmbeddings(nn.Module): return hidden_states -class AIMv2TextEmbeddings(CLIPTextEmbeddings): +class Aimv2TextEmbeddings(CLIPTextEmbeddings): pass -class AIMv2Attention(SiglipAttention): +class Aimv2Attention(SiglipAttention): def __init__(self, config): super().__init__(config) self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) @@ -370,13 +370,13 @@ class AIMv2Attention(SiglipAttention): self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) -class AIMv2EncoderLayer(GradientCheckpointingLayer): - def __init__(self, config: AIMv2VisionConfig): +class Aimv2EncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Aimv2VisionConfig): super().__init__() - self.attention = AIMv2Attention(config) - self.ffn = AIMv2MLP(config) - self.rms_norm1 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) - self.rms_norm2 = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.attention = Aimv2Attention(config) + self.ffn = 
Aimv2MLP(config) + self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) def forward( self, @@ -397,12 +397,12 @@ class AIMv2EncoderLayer(GradientCheckpointingLayer): return (hidden_states, attn_weights) if output_attentions else (hidden_states, None) -class AIMv2Encoder(SiglipEncoder): +class Aimv2Encoder(SiglipEncoder): pass -class AIMv2AttentionPoolingHead(nn.Module): - def __init__(self, config: AIMv2VisionConfig): +class Aimv2AttentionPoolingHead(nn.Module): + def __init__(self, config: Aimv2VisionConfig): super().__init__() self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads @@ -445,7 +445,7 @@ AIMV2_VISION_START_DOCSTRING = r""" and behavior. Parameters: - config ([`AIMv2VisionConfig`]): Model configuration class with all the parameters of the model. + config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ @@ -468,13 +468,13 @@ AIMV2_VISION_INPUTS_DOCSTRING = r""" """ -class AIMv2PreTrainedModel(PreTrainedModel): +class Aimv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. The model is only intended for inference and doesn't support finetuning. """ - config_class = AIMv2Config + config_class = Aimv2Config base_model_prefix = "aimv2" supports_gradient_checkpointing = True _no_split_modules = ["AIMv2SwiGLUFFN"] @@ -490,14 +490,14 @@ class AIMv2PreTrainedModel(PreTrainedModel): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, AIMv2RMSNorm): + elif isinstance(module, Aimv2RMSNorm): module.weight.data.fill_(1.0) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) elif hasattr(module, "logit_scale"): if isinstance(module.logit_scale, nn.Parameter): module.logit_scale.data.fill_(math.log(1 / 0.07)) - elif isinstance(module, AIMv2AttentionPoolingHead): + elif isinstance(module, Aimv2AttentionPoolingHead): module.cls_token.data.normal_(mean=0.0, std=std) @@ -505,20 +505,20 @@ class AIMv2PreTrainedModel(PreTrainedModel): """The vision model from AIMv2 without any head or projection on top.""", AIMV2_VISION_START_DOCSTRING, ) -class AIMv2VisionModel(AIMv2PreTrainedModel): +class Aimv2VisionModel(Aimv2PreTrainedModel): main_input_name = "pixel_values" - def __init__(self, config: AIMv2VisionConfig): + def __init__(self, config: Aimv2VisionConfig): super().__init__(config) self.config = config - self.embeddings = AIMv2VisionEmbeddings(config) - self.encoder = AIMv2Encoder(config) + self.embeddings = Aimv2VisionEmbeddings(config) + self.encoder = Aimv2Encoder(config) # The only change from SiglipVisionTransformer is, layernorm -> rms_norm. 
- self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.use_head = config.use_head if self.use_head: - self.head = AIMv2AttentionPoolingHead(config) + self.head = Aimv2AttentionPoolingHead(config) self.post_init() @@ -527,7 +527,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): @can_return_tuple @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AIMv2VisionConfig) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, pixel_values, @@ -545,7 +545,7 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): >>> import requests >>> from transformers import AutoProcessor, Siglip2VisionModel - >>> model = AIMv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") + >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -583,15 +583,15 @@ class AIMv2VisionModel(AIMv2PreTrainedModel): ) -class AIMv2TextModel(AIMv2PreTrainedModel): +class Aimv2TextModel(Aimv2PreTrainedModel): main_input_name = "input_ids" - def __init__(self, config: AIMv2TextConfig): + def __init__(self, config: Aimv2TextConfig): super().__init__(config) self.config = config - self.embeddings = AIMv2TextEmbeddings(config) - self.encoder = AIMv2Encoder(config) - self.rms_norm = AIMv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.embeddings = Aimv2TextEmbeddings(config) + self.encoder = Aimv2Encoder(config) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) self.eos_token_id = config.eos_token_id @@ -649,16 +649,16 @@ class AIMv2TextModel(AIMv2PreTrainedModel): ) -class AIMv2Model(CLIPModel, nn.Module): - def __init__(self, config: AIMv2Config): +class Aimv2Model(CLIPModel, nn.Module): + def __init__(self, config: Aimv2Config): nn.Module().__init__(config) self.projection_dim = config.projection_dim self.vision_embed_dim = config.vision_config.hidden_size self.text_embed_dim = config.text_config.hidden_size - self.vision_model = AIMv2VisionModel._from_config(config.vision_config) - self.text_model = AIMv2TextModel._from_config(config.text_config) + self.vision_model = Aimv2VisionModel._from_config(config.vision_config) + self.text_model = Aimv2TextModel._from_config(config.text_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) @@ -676,14 +676,14 @@ class AIMv2Model(CLIPModel, nn.Module): return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - ) -> AIMv2Output: + ) -> Aimv2Output: r""" ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, AIMv2Model + >>> from transformers import AutoProcessor, Aimv2Model - >>> model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") + >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -730,7 +730,7 @@ class AIMv2Model(CLIPModel, nn.Module): logits_per_text = (logit_scale * text_embeds) @ 
image_embeds.t() logits_per_image = logits_per_text.t() - return AIMv2Output( + return Aimv2Output( logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, @@ -741,11 +741,11 @@ class AIMv2Model(CLIPModel, nn.Module): __all__ = [ - "AIMv2Config", - "AIMv2VisionConfig", - "AIMv2TextConfig", - "AIMv2VisionModel", - "AIMv2Model", - "AIMv2PreTrainedModel", - "AIMv2TextModel", + "Aimv2Config", + "Aimv2VisionConfig", + "Aimv2TextConfig", + "Aimv2VisionModel", + "Aimv2Model", + "Aimv2PreTrainedModel", + "Aimv2TextModel", ] diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 4f031e05400..843283014f1 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -32,8 +32,8 @@ logger = logging.get_logger(__name__) CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here - ("aimv2", "AIMv2Config"), - ("aimv2_vision_model", "AIMv2VisionConfig"), + ("aimv2", "Aimv2Config"), + ("aimv2_vision_model", "Aimv2VisionConfig"), ("albert", "AlbertConfig"), ("align", "AlignConfig"), ("altclip", "AltCLIPConfig"), @@ -384,7 +384,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here ("aimv2", "AIMv2"), - ("aimv2_vision_model", "AIMv2VisionModel"), + ("aimv2_vision_model", "Aimv2VisionModel"), ("albert", "ALBERT"), ("align", "ALIGN"), ("altclip", "AltCLIP"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1a636a05380..b592f4f9f78 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -32,8 +32,8 @@ logger = logging.get_logger(__name__) MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping - ("aimv2", "AIMv2Model"), - ("aimv2_vision_model", "AIMv2VisionModel"), + ("aimv2", "Aimv2Model"), + ("aimv2_vision_model", "Aimv2VisionModel"), ("albert", "AlbertModel"), ("align", "AlignModel"), ("altclip", "AltCLIPModel"), @@ -630,7 +630,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict( [ # Model for Image mapping - ("aimv2_vision_model", "AIMv2VisionModel"), + ("aimv2_vision_model", "Aimv2VisionModel"), ("beit", "BeitModel"), ("bit", "BitModel"), ("conditional_detr", "ConditionalDetrModel"), diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 1261946c998..93f99f4269c 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -15,6 +15,7 @@ """Testing suite for the PyTorch AIMv2 model.""" import inspect +import os import tempfile import unittest @@ -23,7 +24,7 @@ import requests from parameterized import parameterized from pytest import mark -from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig +from transformers import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig from transformers.testing_utils import ( is_flaky, require_flash_attn, @@ -56,9 +57,9 @@ if is_torch_available(): from torch import nn from transformers import ( - AIMv2Model, - AIMv2TextModel, - AIMv2VisionModel, + Aimv2Model, + Aimv2TextModel, + Aimv2VisionModel, ) @@ -109,7 +110,7 @@ class AIMv2VisionModelTester: return config, pixel_values def get_config(self): - return AIMv2VisionConfig( + return Aimv2VisionConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, @@ -123,7 +124,7 @@ class AIMv2VisionModelTester: ) def 
create_and_check_model(self, config, pixel_values): - model = AIMv2VisionModel(config=config) + model = Aimv2VisionModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -182,7 +183,7 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): attention_mask and seq_length. """ - all_model_classes = (AIMv2VisionModel,) if is_torch_available() else () + all_model_classes = (Aimv2VisionModel,) if is_torch_available() else () fx_compatible = False test_pruning = False test_resize_embeddings = False @@ -191,7 +192,7 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AIMv2VisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=AIMv2VisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=Aimv2VisionConfig, has_text_modality=False, hidden_size=37 ) def test_config(self): @@ -281,7 +282,7 @@ class AIMv2TextModelTester: return config, input_ids, input_mask def get_config(self): - return AIMv2TextConfig( + return Aimv2TextConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, projection_dim=self.projection_dim, @@ -294,7 +295,7 @@ class AIMv2TextModelTester: ) def create_and_check_model(self, config, input_ids, input_mask): - model = AIMv2TextModel(config=config) + model = Aimv2TextModel(config=config) model.to(torch_device) model.eval() with torch.no_grad(): @@ -312,7 +313,7 @@ class AIMv2TextModelTester: @require_torch class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): - all_model_classes = (AIMv2TextModel,) if is_torch_available() else () + all_model_classes = (Aimv2TextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False test_head_masking = False @@ -320,7 +321,7 @@ class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AIMv2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=AIMv2TextConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=Aimv2TextConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -356,12 +357,12 @@ class AIMv2ModelTester: return config, input_ids, attention_mask, pixel_values def get_config(self): - return AIMv2Config.from_text_vision_configs( + return Aimv2Config.from_text_vision_configs( self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = AIMv2Model(config).to(torch_device).eval() + model = Aimv2Model(config).to(torch_device).eval() with torch.no_grad(): result = model(input_ids, pixel_values, attention_mask) self.parent.assertEqual( @@ -387,9 +388,9 @@ class AIMv2ModelTester: @require_torch class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): additional_model_inputs = ["pixel_values"] - all_model_classes = (AIMv2Model,) if is_torch_available() else () + all_model_classes = (Aimv2Model,) if is_torch_available() else () pipeline_model_mapping = ( - {"feature-extraction": AIMv2Model, "image-feature-extraction": AIMv2VisionModel} + {"feature-extraction": Aimv2Model, "image-feature-extraction": Aimv2VisionModel} if is_torch_available() else {} ) @@ -404,7 +405,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa self.model_tester = AIMv2ModelTester(self) common_properties = ["projection_dim", "logit_scale_init_value"] self.config_tester = ConfigTester( 
- self, config_class=AIMv2Config, has_text_modality=False, common_properties=common_properties + self, config_class=Aimv2Config, has_text_modality=False, common_properties=common_properties ) def test_model(self): @@ -427,7 +428,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip(reason="AIMv2Model does not have input/output embeddings") + @unittest.skip(reason="Aimv2Model does not have input/output embeddings") def test_model_get_set_embeddings(self): pass @@ -458,16 +459,16 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # Save AIMv2Config and check if we can load AIMv2VisionConfig from it + # Save Aimv2Config and check if we can load Aimv2VisionConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - vision_config = AIMv2VisionConfig.from_pretrained(tmp_dir_name) + vision_config = Aimv2VisionConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - # Save AIMv2Config and check if we can load AIMv2TextConfig from it + # Save Aimv2Config and check if we can load Aimv2TextConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) - text_config = AIMv2TextConfig.from_pretrained(tmp_dir_name) + text_config = Aimv2TextConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) @require_flash_attn @@ -567,6 +568,78 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa # Adding only flaky decorator here and call the parent test method return getattr(ModelTesterMixin, self._testMethodName)(self) + # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest._create_and_check_torchscript with CLIP->AIMv2 + def _create_and_check_torchscript(self, config, inputs_dict): + if not self.test_torchscript: + self.skipTest(reason="test_torchscript is set to False") + + configs_no_init = _config_zero_init(config) # To be sure we have no Nan + configs_no_init.torchscript = True + configs_no_init.return_dict = False + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + model.to(torch_device) + model.eval() + + try: + input_ids = inputs_dict["input_ids"] + pixel_values = inputs_dict["pixel_values"] # AIMv2 needs pixel_values + traced_model = torch.jit.trace(model, (input_ids, pixel_values)) + except RuntimeError: + self.fail("Couldn't trace module.") + + with tempfile.TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") + + try: + torch.jit.save(traced_model, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") + + model.to(torch_device) + model.eval() + + loaded_model.to(torch_device) + loaded_model.eval() + + model_state_dict = model.state_dict() + loaded_model_state_dict = loaded_model.state_dict() + + non_persistent_buffers = {} + for key in loaded_model_state_dict.keys(): + if key not in model_state_dict.keys(): + non_persistent_buffers[key] = loaded_model_state_dict[key] + + loaded_model_state_dict = { + key: value for key, value in loaded_model_state_dict.items() if key not in 
non_persistent_buffers + } + + self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) + + model_buffers = list(model.buffers()) + for non_persistent_buffer in non_persistent_buffers.values(): + found_buffer = False + for i, model_buffer in enumerate(model_buffers): + if torch.equal(non_persistent_buffer, model_buffer): + found_buffer = True + break + + self.assertTrue(found_buffer) + model_buffers.pop(i) + + models_equal = True + for layer_name, p1 in model_state_dict.items(): + p2 = loaded_model_state_dict[layer_name] + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + @require_vision @require_torch @@ -574,7 +647,7 @@ class AIMv2ModelIntegrationTest(unittest.TestCase): @slow def test_inference(self): model_name = "yaswanthgali/aimv2-large-patch14-224-lit-HF" - model = AIMv2Model.from_pretrained(model_name, device_map="auto") + model = Aimv2Model.from_pretrained(model_name, device_map="auto") processor = AutoProcessor.from_pretrained(model_name) image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) @@ -608,7 +681,7 @@ class AIMv2VisionModelIntegrationTests(unittest.TestCase): def test_inference(self): model_name = "yaswanthgali/aimv2-large-patch14-224-HF" - model = AIMv2VisionModel.from_pretrained(model_name, device_map="auto") + model = Aimv2VisionModel.from_pretrained(model_name, device_map="auto") processor = AutoImageProcessor.from_pretrained(model_name) image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) @@ -639,7 +712,7 @@ class AIMv2VisionModelIntegrationTests(unittest.TestCase): def test_inference_for_native_resolution(self): model_name = "yaswanthgali/aimv2-large-patch14-native-HF" - model = AIMv2VisionModel.from_pretrained(model_name, device_map="auto") + model = Aimv2VisionModel.from_pretrained(model_name, device_map="auto") processor = AutoImageProcessor.from_pretrained(model_name) image = image = Image.open( diff --git a/utils/check_repo.py b/utils/check_repo.py index 0f2b94e00ac..56ab90f4050 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -190,7 +190,7 @@ TEST_FILES_WITH_NO_COMMON_TESTS = [ # should **not** be the rule. 
IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ # models to ignore for model xxx mapping - "AIMv2TextModel", + "Aimv2TextModel", "AlignTextModel", "AlignVisionModel", "ClapTextModel", From 9aae4d4711e52058fd5b01055360a9f0c7281761 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 30 Apr 2025 21:38:22 +0530 Subject: [PATCH 51/62] Nit --- src/transformers/models/aimv2/modeling_aimv2.py | 9 +++------ src/transformers/models/aimv2/modular_aimv2.py | 7 +++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 8f6f1199eaa..581c29544e6 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -28,15 +28,12 @@ import torch import torch.nn.functional as F from torch import nn -from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import BaseModelOutputWithPooling -from transformers.modeling_utils import PreTrainedModel - from ...activations import ACT2FN from ...integrations import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_layers import GradientCheckpointingLayer -from ...modeling_outputs import BaseModelOutput -from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...utils import ( ModelOutput, add_start_docstrings, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 52f5f67d9e8..5bdcb14b565 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -22,11 +22,10 @@ import torch import torch.nn.functional as F from torch import nn -from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import BaseModelOutputWithPooling -from transformers.modeling_utils import PreTrainedModel - +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, From bc5858d5c861924923355ff5202b31df445186f6 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Fri, 16 May 2025 18:31:47 +0530 Subject: [PATCH 52/62] make style --- .../models/aimv2/configuration_aimv2.py | 6 +- .../models/aimv2/modeling_aimv2.py | 100 ++---------------- .../models/aimv2/modular_aimv2.py | 4 +- 3 files changed, 13 insertions(+), 97 deletions(-) diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index ec7a643da6e..bf0064ad9f6 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -19,6 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional + from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -186,8 +188,8 @@ class Aimv2TextConfig(PretrainedConfig): qkv_bias: bool = False, mlp_bias: bool = False, hidden_act: str = "silu", - pad_token_id: int = None, - bos_token_id: int = None, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = None, eos_token_id: int = 49407, max_position_embeddings: int = 77, initializer_range: bool = 0.02, diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 581c29544e6..66ad345280f 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -38,6 +38,7 @@ from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + auto_docstring, can_return_tuple, logging, replace_return_docstrings, @@ -673,94 +674,7 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: return normed_tensor -AIMV2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Aimv2Config`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -AIMV2_TEXT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - -AIMV2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`Aimv2ImageProcessor.__call__`] for details. - return_loss (`bool`, *optional*): - Whether or not to return the contrastive loss. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults `False`): - Whether to interpolate the pre-trained position encodings. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings(AIMV2_START_DOCSTRING) +@auto_docstring class Aimv2Model(Aimv2PreTrainedModel): config_class = Aimv2Config _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"] @@ -783,7 +697,7 @@ class Aimv2Model(Aimv2PreTrainedModel): self.post_init() - @add_start_docstrings_to_model_forward(AIMV2_TEXT_INPUTS_DOCSTRING) + @auto_docstring def get_text_features( self, input_ids: Optional[torch.Tensor] = None, @@ -827,7 +741,7 @@ class Aimv2Model(Aimv2PreTrainedModel): return text_features - @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + @auto_docstring def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, @@ -876,8 +790,7 @@ class Aimv2Model(Aimv2PreTrainedModel): return image_features @can_return_tuple - @add_start_docstrings_to_model_forward(AIMV2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Aimv2Output, config_class=Aimv2Config) + @auto_docstring def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -888,7 +801,8 @@ class Aimv2Model(Aimv2PreTrainedModel): output_hidden_states: Optional[bool] = None, ) -> Aimv2Output: r""" - Returns: + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. 
Examples: diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 5bdcb14b565..9572dd672be 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -200,8 +200,8 @@ class Aimv2TextConfig(SiglipTextConfig): qkv_bias: bool = False, mlp_bias: bool = False, hidden_act: str = "silu", - pad_token_id: int = None, - bos_token_id: int = None, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = None, eos_token_id: int = 49407, max_position_embeddings: int = 77, initializer_range: bool = 0.02, From 5bc37096c02f647a1099b4d922b1ab8bd4f3ea91 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 28 May 2025 21:23:43 +0530 Subject: [PATCH 53/62] Add model doc pointer --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 873df4aa86a..700e218d8be 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -369,6 +369,8 @@ title: Main Classes - sections: - sections: + - local: model_doc/aimv2 + title: Aimv2 - local: model_doc/albert title: ALBERT - local: model_doc/bamba From 13966059b77def1818bfcab77e8c611f51e673c7 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Thu, 29 May 2025 08:06:09 +0530 Subject: [PATCH 54/62] make style --- src/transformers/models/auto/tokenization_auto.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a5e0064dc21..d62a886e514 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -56,13 +56,13 @@ logger = logging.get_logger(__name__) # Explicit rather than inferred generics to significantly improves completion suggestion performance for language servers. 
TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]]( [ - ( + ( "aimv2", - ( + ( "CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None, - ), ), + ), ( "albert", ( From 164d0e321351df4d625dacc21aa6eebe0c42c89a Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Thu, 29 May 2025 08:08:31 +0530 Subject: [PATCH 55/62] Update model doc section --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d3e7c9438be..4f1337a56ff 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -369,8 +369,6 @@ title: Main Classes - sections: - sections: - - local: model_doc/aimv2 - title: Aimv2 - local: model_doc/albert title: ALBERT - local: model_doc/bamba @@ -695,6 +693,8 @@ title: Zamba2 title: Text models - sections: + - local: model_doc/aimv2 + title: Aimv2 - local: model_doc/beit title: BEiT - local: model_doc/bit From 95506b5a7478ec9aa262c912e5a836112ef6e28d Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 18 Jun 2025 00:05:38 +0530 Subject: [PATCH 56/62] updates --- .../models/aimv2/modeling_aimv2.py | 66 ++++++------------- .../models/aimv2/modular_aimv2.py | 62 ++++++----------- .../models/siglip/modeling_siglip.py | 1 - .../models/siglip2/modeling_siglip2.py | 1 - tests/models/aimv2/test_modeling_aimv2.py | 2 +- 5 files changed, 40 insertions(+), 92 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 66ad345280f..666535cb7c4 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -34,15 +34,7 @@ from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...utils import ( - ModelOutput, - add_start_docstrings, - add_start_docstrings_to_model_forward, - auto_docstring, - can_return_tuple, - logging, - replace_return_docstrings, -) +from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, replace_return_docstrings from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig @@ -449,6 +441,7 @@ class Aimv2AttentionPoolingHead(nn.Module): return output +@auto_docstring class Aimv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -458,7 +451,12 @@ class Aimv2PreTrainedModel(PreTrainedModel): config_class = Aimv2Config base_model_prefix = "aimv2" supports_gradient_checkpointing = True - _no_split_modules = ["AIMv2SwiGLUFFN"] + _no_split_modules = [ + "Aimv2EncoderLayer", + "Aimv2AttentionPoolingHead", + "Aimv2VisionEmbeddings", + "Aimv2TextEmbeddings", + ] _supports_sdpa = True def _init_weights(self, module): @@ -482,42 +480,10 @@ class Aimv2PreTrainedModel(PreTrainedModel): module.cls_token.data.normal_(mean=0.0, std=std) -AIMV2_VISION_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -AIMV2_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults `False`): - Whether to interpolate the pre-trained position encodings. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - """The vision model from AIMv2 without any head or projection on top.""", - AIMV2_VISION_START_DOCSTRING, +@auto_docstring( + custom_intro=""" + The Vision model from AIMv2 without any head or projection on top. + """ ) class Aimv2VisionModel(Aimv2PreTrainedModel): main_input_name = "pixel_values" @@ -540,7 +506,7 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): return self.embeddings.patch_embed @can_return_tuple - @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + @auto_docstring @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, @@ -597,6 +563,11 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): ) +@auto_docstring( + custom_intro=""" + The text model from AIMv2 without any head or projection on top. + """ +) class Aimv2TextModel(Aimv2PreTrainedModel): main_input_name = "input_ids" @@ -618,6 +589,7 @@ class Aimv2TextModel(Aimv2PreTrainedModel): self.embeddings.token_embedding = value @can_return_tuple + @auto_docstring def forward( self, input_ids, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 9572dd672be..a45f03ebf43 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -27,8 +27,7 @@ from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, + auto_docstring, can_return_tuple, logging, replace_return_docstrings, @@ -289,8 +288,6 @@ class Aimv2Config(SiglipConfig): del self.initializer_factor - pass - class Aimv2Output(SiglipOutput): pass @@ -434,39 +431,7 @@ class Aimv2AttentionPoolingHead(nn.Module): return output -AIMV2_VISION_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) 
- - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Aimv2VisionConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -AIMV2_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults `False`): - Whether to interpolate the pre-trained position encodings. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - +@auto_docstring class Aimv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -476,7 +441,12 @@ class Aimv2PreTrainedModel(PreTrainedModel): config_class = Aimv2Config base_model_prefix = "aimv2" supports_gradient_checkpointing = True - _no_split_modules = ["AIMv2SwiGLUFFN"] + _no_split_modules = [ + "Aimv2EncoderLayer", + "Aimv2AttentionPoolingHead", + "Aimv2VisionEmbeddings", + "Aimv2TextEmbeddings", + ] _supports_sdpa = True def _init_weights(self, module): @@ -500,9 +470,10 @@ class Aimv2PreTrainedModel(PreTrainedModel): module.cls_token.data.normal_(mean=0.0, std=std) -@add_start_docstrings( - """The vision model from AIMv2 without any head or projection on top.""", - AIMV2_VISION_START_DOCSTRING, +@auto_docstring( + custom_intro=""" + The Vision model from AIMv2 without any head or projection on top. + """ ) class Aimv2VisionModel(Aimv2PreTrainedModel): main_input_name = "pixel_values" @@ -525,7 +496,7 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): return self.embeddings.patch_embed @can_return_tuple - @add_start_docstrings_to_model_forward(AIMV2_VISION_INPUTS_DOCSTRING) + @auto_docstring @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, @@ -582,6 +553,11 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): ) +@auto_docstring( + custom_intro=""" + The text model from AIMv2 without any head or projection on top. 
+ """ +) class Aimv2TextModel(Aimv2PreTrainedModel): main_input_name = "input_ids" @@ -603,6 +579,7 @@ class Aimv2TextModel(Aimv2PreTrainedModel): self.embeddings.token_embedding = value @can_return_tuple + @auto_docstring def forward( self, input_ids, @@ -648,6 +625,7 @@ class Aimv2TextModel(Aimv2PreTrainedModel): ) +@auto_docstring class Aimv2Model(CLIPModel, nn.Module): def __init__(self, config: Aimv2Config): nn.Module().__init__(config) diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index fa5e3f6dc84..0756180e590 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -510,7 +510,6 @@ class SiglipPreTrainedModel(PreTrainedModel): _no_split_modules = [ "SiglipTextEmbeddings", - "SiglipEncoderLayer", "SiglipVisionEmbeddings", "SiglipEncoderLayer", "SiglipMultiheadAttentionPoolingHead", diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py index eb3bf5d4a34..f3e79f143bb 100644 --- a/src/transformers/models/siglip2/modeling_siglip2.py +++ b/src/transformers/models/siglip2/modeling_siglip2.py @@ -742,7 +742,6 @@ class Siglip2PreTrainedModel(PreTrainedModel): _no_split_modules = [ "Siglip2TextEmbeddings", - "Siglip2EncoderLayer", "Siglip2VisionEmbeddings", "Siglip2EncoderLayer", "Siglip2MultiheadAttentionPoolingHead", diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 93f99f4269c..ee66e94090d 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From dad9d00cdc0a64777c8bfddb59291686445b162f Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 25 Jun 2025 23:02:59 +0530 Subject: [PATCH 57/62] Modify attn mask and interface --- .../convert_aimv2_original_pytorch_to_hf.py | 4 +- .../models/aimv2/modeling_aimv2.py | 78 ++++++++----------- .../models/aimv2/modular_aimv2.py | 26 +++---- 3 files changed, 46 insertions(+), 62 deletions(-) diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py index 17c419074d0..824d6b5138f 100644 --- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py +++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py @@ -17,7 +17,7 @@ import argparse import gc import os import re -from typing import Dict, Optional +from typing import Optional import torch from huggingface_hub import snapshot_download @@ -90,7 +90,7 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { } -def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> Dict[str, torch.Tensor]: +def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> dict[str, torch.Tensor]: # Download only the model.safetensors file directory_path = snapshot_download( repo_id=model_id, diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 666535cb7c4..82c3b1847a8 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -22,7 +22,7 @@ import math from dataclasses import dataclass -from typing import Any, Callable, Optional, Tuple +from typing import Any, Callable, Optional import torch import torch.nn.functional as F @@ -30,37 +30,34 @@ from torch import nn from ...activations import ACT2FN from ...integrations import use_kernel_forward_from_hub -from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...masking_utils import create_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, replace_return_docstrings +from ...utils import ModelOutput, auto_docstring, can_return_tuple from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig -logger = logging.get_logger(__name__) - - @dataclass +@auto_docstring class Aimv2Output(ModelOutput): - """ - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): - Contrastive loss for image-text similarity. - logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): - The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text - similarity scores. - logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): - The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image - similarity scores. - text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`]. - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): - The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`]. 
- text_model_output (`BaseModelOutputWithPooling`): - The output of the [`Aimv2TextModel`]. - vision_model_output (`BaseModelOutputWithPooling`): - The output of the [`Aimv2VisionModel`]. + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`Aimv2TextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`Aimv2VisionModel`]. """ loss: Optional[torch.FloatTensor] = None @@ -71,7 +68,7 @@ class Aimv2Output(ModelOutput): text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None - def to_tuple(self) -> Tuple[Any]: + def to_tuple(self) -> tuple[Any]: return tuple( self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() for k in self.keys() @@ -256,8 +253,7 @@ class Aimv2Attention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" batch_size, seq_length, embed_dim = hidden_states.shape @@ -272,13 +268,7 @@ class Aimv2Attention(nn.Module): attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and output_attentions: - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -294,9 +284,6 @@ class Aimv2Attention(nn.Module): attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() attn_output = self.out_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights @@ -313,11 +300,9 @@ class Aimv2EncoderLayer(GradientCheckpointingLayer): hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) - attn_output, attn_weights = self.attention( - hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) + attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask) hidden_states = hidden_states + attn_output norm_hidden_states = self.rms_norm2(hidden_states) @@ -507,7 +492,6 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): @can_return_tuple @auto_docstring - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, pixel_values, @@ -606,9 +590,13 @@ class Aimv2TextModel(Aimv2PreTrainedModel): _, seq_len, _ = hidden_states.shape if attention_mask is not None: - mask_converter = AttentionMaskConverter(True) - attention_mask = mask_converter.to_4d( - attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype + cache_position = torch.arange(seq_len, device=hidden_states.device) + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=None, ) encoder_outputs = self.encoder( diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index a45f03ebf43..b13368fc18c 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -16,21 +16,19 @@ """Pytorch implementation of AIMv2 Model""" import math -from typing import Optional, Tuple +from typing import Optional import torch import torch.nn.functional as F from torch import nn -from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...masking_utils import create_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...utils import ( auto_docstring, can_return_tuple, - logging, - replace_return_docstrings, ) from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm @@ -38,9 +36,6 @@ from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, Siglip from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput -logger = logging.get_logger(__name__) - - class Aimv2VisionConfig(SiglipVisionConfig): r""" This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. 
It is used to instantiate a @@ -379,11 +374,9 @@ class Aimv2EncoderLayer(GradientCheckpointingLayer): hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: norm_hidden_states = self.rms_norm1(hidden_states) - attn_output, attn_weights = self.attention( - hidden_states=norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) + attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask) hidden_states = hidden_states + attn_output norm_hidden_states = self.rms_norm2(hidden_states) @@ -497,7 +490,6 @@ class Aimv2VisionModel(Aimv2PreTrainedModel): @can_return_tuple @auto_docstring - @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Aimv2VisionConfig) def forward( self, pixel_values, @@ -596,9 +588,13 @@ class Aimv2TextModel(Aimv2PreTrainedModel): _, seq_len, _ = hidden_states.shape if attention_mask is not None: - mask_converter = AttentionMaskConverter(True) - attention_mask = mask_converter.to_4d( - attention_mask, key_value_length=seq_len, query_length=seq_len, dtype=hidden_states.dtype + cache_position = torch.arange(seq_len, device=hidden_states.device) + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=None, ) encoder_outputs = self.encoder( From 781834a94c85fb0d4762e8d2a2b6bb85bfd4a16e Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Thu, 26 Jun 2025 00:31:05 +0530 Subject: [PATCH 58/62] update test --- tests/models/aimv2/test_modeling_aimv2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index ee66e94090d..da0de82b4b5 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -670,13 +670,14 @@ class AIMv2ModelIntegrationTest(unittest.TestCase): ) # handle device - expected_logits = torch.tensor([[34.2415, 24.6724]]).to(model.device) + expected_logits = torch.tensor([[33.3550, 26.4255]]).to(model.device) self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) @require_vision @require_torch class AIMv2VisionModelIntegrationTests(unittest.TestCase): + @slow def test_inference(self): model_name = "yaswanthgali/aimv2-large-patch14-224-HF" From 8eac21d927a6920d65815f313196dfa9955ec182 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Fri, 27 Jun 2025 16:02:09 +0530 Subject: [PATCH 59/62] Final change --- src/transformers/models/aimv2/modeling_aimv2.py | 1 + tests/models/aimv2/test_modeling_aimv2.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 82c3b1847a8..b98f1f68773 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -253,6 +253,7 @@ class Aimv2Attention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: """Input shape: Batch x Time x Channel""" diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index da0de82b4b5..97789573259 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ 
b/tests/models/aimv2/test_modeling_aimv2.py @@ -677,7 +677,6 @@ class AIMv2ModelIntegrationTest(unittest.TestCase): @require_vision @require_torch class AIMv2VisionModelIntegrationTests(unittest.TestCase): - @slow def test_inference(self): model_name = "yaswanthgali/aimv2-large-patch14-224-HF" From 39ecc8d6b68e8755c41a2cbaf506d70605b0a4ad Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Fri, 27 Jun 2025 17:12:59 +0530 Subject: [PATCH 60/62] Utilize flash and flex attn --- .../models/aimv2/modeling_aimv2.py | 21 ++++++++++--------- .../models/aimv2/modular_aimv2.py | 21 +++++++++++-------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index b98f1f68773..2f2ad49a6ab 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -444,6 +444,8 @@ class Aimv2PreTrainedModel(PreTrainedModel): "Aimv2TextEmbeddings", ] _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_flex_attn = True def _init_weights(self, module): std = ( @@ -590,15 +592,14 @@ class Aimv2TextModel(Aimv2PreTrainedModel): hidden_states = self.embeddings(input_ids) _, seq_len, _ = hidden_states.shape - if attention_mask is not None: - cache_position = torch.arange(seq_len, device=hidden_states.device) - attention_mask = create_causal_mask( - config=self.config, - input_embeds=hidden_states, - attention_mask=attention_mask, - cache_position=cache_position, - past_key_values=None, - ) + cache_position = torch.arange(seq_len, device=hidden_states.device) + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=None, + ) encoder_outputs = self.encoder( inputs_embeds=hidden_states, @@ -750,8 +751,8 @@ class Aimv2Model(Aimv2PreTrainedModel): return image_features - @can_return_tuple @auto_docstring + @can_return_tuple def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index b13368fc18c..b7bd720588e 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -441,6 +441,8 @@ class Aimv2PreTrainedModel(PreTrainedModel): "Aimv2TextEmbeddings", ] _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_flex_attn = True def _init_weights(self, module): std = ( @@ -587,15 +589,14 @@ class Aimv2TextModel(Aimv2PreTrainedModel): hidden_states = self.embeddings(input_ids) _, seq_len, _ = hidden_states.shape - if attention_mask is not None: - cache_position = torch.arange(seq_len, device=hidden_states.device) - attention_mask = create_causal_mask( - config=self.config, - input_embeds=hidden_states, - attention_mask=attention_mask, - cache_position=cache_position, - past_key_values=None, - ) + cache_position = torch.arange(seq_len, device=hidden_states.device) + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=None, + ) encoder_outputs = self.encoder( inputs_embeds=hidden_states, @@ -641,6 +642,8 @@ class Aimv2Model(CLIPModel, nn.Module): self.post_init() + @auto_docstring + @can_return_tuple def forward( self, input_ids: Optional[torch.LongTensor] = None, From 0627ef687476cf47ed44dc0ff99398f38a4ce652 Mon Sep 17 00:00:00 2001 From: yaswanth 
From 0627ef687476cf47ed44dc0ff99398f38a4ce652 Mon Sep 17 00:00:00 2001
From: yaswanth
Date: Sun, 29 Jun 2025 01:54:56 +0530
Subject: [PATCH 61/62] keep attn mask

---
 src/transformers/models/aimv2/modeling_aimv2.py | 15 ++++++++-------
 src/transformers/models/aimv2/modular_aimv2.py  | 15 ++++++++-------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py
index 2f2ad49a6ab..bcef1de4e79 100644
--- a/src/transformers/models/aimv2/modeling_aimv2.py
+++ b/src/transformers/models/aimv2/modeling_aimv2.py
@@ -593,13 +593,14 @@ class Aimv2TextModel(Aimv2PreTrainedModel):
         _, seq_len, _ = hidden_states.shape
 
         cache_position = torch.arange(seq_len, device=hidden_states.device)
-        attention_mask = create_causal_mask(
-            config=self.config,
-            input_embeds=hidden_states,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            past_key_values=None,
-        )
+        if attention_mask is not None:
+            attention_mask = create_causal_mask(
+                config=self.config,
+                input_embeds=hidden_states,
+                attention_mask=attention_mask,
+                cache_position=cache_position,
+                past_key_values=None,
+            )
 
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py
index b7bd720588e..42d297199d5 100644
--- a/src/transformers/models/aimv2/modular_aimv2.py
+++ b/src/transformers/models/aimv2/modular_aimv2.py
@@ -590,13 +590,14 @@ class Aimv2TextModel(Aimv2PreTrainedModel):
         _, seq_len, _ = hidden_states.shape
 
         cache_position = torch.arange(seq_len, device=hidden_states.device)
-        attention_mask = create_causal_mask(
-            config=self.config,
-            input_embeds=hidden_states,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            past_key_values=None,
-        )
+        if attention_mask is not None:
+            attention_mask = create_causal_mask(
+                config=self.config,
+                input_embeds=hidden_states,
+                attention_mask=attention_mask,
+                cache_position=cache_position,
+                past_key_values=None,
+            )
 
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
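
Note: restoring the `if attention_mask is not None:` guard means the 4-D mask is only materialized when the caller actually passes a padding mask; otherwise the encoder layers receive `attention_mask=None`. A small sketch of the two call paths, using a randomly initialized text tower (this assumes the default `Aimv2TextConfig` exposes `vocab_size` and that the text model returns `last_hidden_state`; it is an illustration, not a test from this series):

import torch
from transformers import Aimv2TextConfig, Aimv2TextModel

# A randomly initialized text tower is enough to exercise both branches; config values are the defaults.
config = Aimv2TextConfig()
model = Aimv2TextModel(config).eval()

input_ids = torch.randint(0, config.vocab_size, (2, 8))
attention_mask = torch.tensor([[1] * 8, [1] * 5 + [0] * 3])  # second row is right-padded

with torch.no_grad():
    # Padding mask supplied -> the guarded branch builds the combined causal + padding mask.
    padded = model(input_ids=input_ids, attention_mask=attention_mask)
    # No mask supplied -> the branch is skipped and the encoder sees attention_mask=None.
    unpadded = model(input_ids=input_ids)

print(padded.last_hidden_state.shape, unpadded.last_hidden_state.shape)
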
""" @@ -177,9 +177,9 @@ class AIMv2ModelTesterMixin(ModelTesterMixin): @require_torch -class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): +class Aimv2VisionModelTest(Aimv2ModelTesterMixin, unittest.TestCase): """ - Here we also overwrite some of the tests of test_modeling_common.py, as AIMv2 does not use input_ids, inputs_embeds, + Here we also overwrite some of the tests of test_modeling_common.py, as Aimv2 does not use input_ids, inputs_embeds, attention_mask and seq_length. """ @@ -190,7 +190,7 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): test_head_masking = False def setUp(self): - self.model_tester = AIMv2VisionModelTester(self) + self.model_tester = Aimv2VisionModelTester(self) self.config_tester = ConfigTester( self, config_class=Aimv2VisionConfig, has_text_modality=False, hidden_size=37 ) @@ -198,7 +198,7 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip(reason="AIMv2 does not use inputs_embeds") + @unittest.skip(reason="Aimv2 does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -228,7 +228,7 @@ class AIMv2VisionModelTest(AIMv2ModelTesterMixin, unittest.TestCase): self.model_tester.create_and_check_model(*config_and_inputs) -class AIMv2TextModelTester: +class Aimv2TextModelTester: def __init__( self, parent, @@ -312,7 +312,7 @@ class AIMv2TextModelTester: @require_torch -class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): +class Aimv2TextModelTest(Aimv2ModelTesterMixin, unittest.TestCase): all_model_classes = (Aimv2TextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -320,7 +320,7 @@ class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): test_resize_embeddings = False def setUp(self): - self.model_tester = AIMv2TextModelTester(self) + self.model_tester = Aimv2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Aimv2TextConfig, hidden_size=37) def test_config(self): @@ -330,12 +330,12 @@ class AIMv2TextModelTest(AIMv2ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="AIMv2 does not use inputs_embeds") + @unittest.skip(reason="Aimv2 does not use inputs_embeds") def test_inputs_embeds(self): pass -class AIMv2ModelTester: +class Aimv2ModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=False): if text_kwargs is None: text_kwargs = {} @@ -343,8 +343,8 @@ class AIMv2ModelTester: vision_kwargs = {} self.parent = parent - self.text_model_tester = AIMv2TextModelTester(parent, **text_kwargs) - self.vision_model_tester = AIMv2VisionModelTester(parent, **vision_kwargs) + self.text_model_tester = Aimv2TextModelTester(parent, **text_kwargs) + self.vision_model_tester = Aimv2VisionModelTester(parent, **vision_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test self.is_training = is_training @@ -386,7 +386,7 @@ class AIMv2ModelTester: @require_torch -class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): +class Aimv2ModelTest(Aimv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): additional_model_inputs = ["pixel_values"] all_model_classes = (Aimv2Model,) if is_torch_available() else () pipeline_model_mapping = ( @@ -402,7 +402,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, 
@@ -402,7 +402,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
     _is_composite = True
 
     def setUp(self):
-        self.model_tester = AIMv2ModelTester(self)
+        self.model_tester = Aimv2ModelTester(self)
         common_properties = ["projection_dim", "logit_scale_init_value"]
         self.config_tester = ConfigTester(
             self, config_class=Aimv2Config, has_text_modality=False, common_properties=common_properties
@@ -432,7 +432,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
     def test_model_get_set_embeddings(self):
         pass
 
-    # Override as the `logit_scale` parameter initialization is different for AIMv2
+    # Override as the `logit_scale` parameter initialization is different for Aimv2
     def test_initialization(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -568,7 +568,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
             # Adding only flaky decorator here and call the parent test method
             return getattr(ModelTesterMixin, self._testMethodName)(self)
 
-    # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest._create_and_check_torchscript with CLIP->AIMv2
+    # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest._create_and_check_torchscript with CLIP->Aimv2
     def _create_and_check_torchscript(self, config, inputs_dict):
         if not self.test_torchscript:
             self.skipTest(reason="test_torchscript is set to False")
@@ -583,7 +583,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
 
             try:
                 input_ids = inputs_dict["input_ids"]
-                pixel_values = inputs_dict["pixel_values"]  # AIMv2 needs pixel_values
+                pixel_values = inputs_dict["pixel_values"]  # Aimv2 needs pixel_values
                 traced_model = torch.jit.trace(model, (input_ids, pixel_values))
             except RuntimeError:
                 self.fail("Couldn't trace module.")
@@ -643,7 +643,7 @@ class AIMv2ModelTest(AIMv2ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
 
 @require_vision
 @require_torch
-class AIMv2ModelIntegrationTest(unittest.TestCase):
+class Aimv2ModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference(self):
         model_name = "yaswanthgali/aimv2-large-patch14-224-lit-HF"
@@ -676,7 +676,7 @@ class AIMv2ModelIntegrationTest(unittest.TestCase):
 
 @require_vision
 @require_torch
-class AIMv2VisionModelIntegrationTests(unittest.TestCase):
+class Aimv2VisionModelIntegrationTests(unittest.TestCase):
     @slow
     def test_inference(self):
         model_name = "yaswanthgali/aimv2-large-patch14-224-HF"
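
Note: with the test classes renamed to the `Aimv2*` camelcase convention, the series is complete. End-to-end usage mirroring `Aimv2ModelIntegrationTest` would look roughly like the sketch below; the checkpoint is the one used in the tests and may move, and the COCO image URL is a commonly used example, not taken from these patches.

import requests
import torch
from PIL import Image
from transformers import Aimv2Model, AutoProcessor

ckpt = "yaswanthgali/aimv2-large-patch14-224-lit-HF"
model = Aimv2Model.from_pretrained(ckpt).eval()
processor = AutoProcessor.from_pretrained(ckpt)

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
texts = ["a photo of two cats", "a photo of a dog"]

inputs = processor(text=texts, images=image, padding=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Higher logit -> better image/text match, as asserted on logits_per_image in the integration test.
probs = outputs.logits_per_image.softmax(dim=-1)
print(probs)
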