Model skeleton

yaswanth 2025-03-07 13:59:06 +05:30 committed by yaswant19
parent c9d1e5238a
commit 8198d49871
15 changed files with 1945 additions and 0 deletions

View File

@ -1,3 +1,4 @@
- sections:
- local: index
title: Transformers
@ -6,8 +7,11 @@
- local: quicktour
title: Quickstart
title: Get started
- isExpanded: false
sections:
- sections:
- sections:
- local: models
title: Loading models
@ -30,6 +34,8 @@
- local: attention
title: Attention mechanisms
title: Models
- sections:
- local: fast_tokenizers
title: Tokenizers
@ -47,8 +53,12 @@
title: Padding and truncation
title: Preprocessors
title: Base classes
- isExpanded: false
sections:
- sections:
- sections:
- local: pipeline_tutorial
title: Pipeline
@ -59,6 +69,8 @@
- local: add_new_pipeline
title: Adding a new pipeline
title: Pipeline API
- sections:
- local: llm_tutorial
title: Text generation
@ -81,6 +93,8 @@
- local: perplexity
title: Perplexity of fixed-length models
title: LLMs
- sections:
- local: conversations
title: Chat basics
@ -93,6 +107,8 @@
- local: chat_extras
title: Tools and RAG
title: Chat with models
- sections:
- local: perf_torch_compile
title: torch.compile
@ -105,13 +121,17 @@
- local: tf_xla
title: XLA
title: Optimization
- local: agents
title: Agents
- local: tools
title: Tools
title: Inference
- isExpanded: false
sections:
- sections:
- sections:
- local: trainer
title: Trainer
@ -122,6 +142,8 @@
- local: hpo_train
title: Hyperparameter search
title: Trainer API
- sections:
- local: gpu_selection
title: GPU selection
@ -138,6 +160,8 @@
- local: perf_train_gpu_many
title: Parallelism methods
title: Distributed training
- sections:
- local: perf_train_gpu_one
title: GPU
@ -150,11 +174,14 @@
- local: perf_hardware
title: Build your own machine
title: Hardware
- local: peft
title: PEFT
- local: model_memory_anatomy
title: Model training anatomy
title: Training
- isExpanded: false
sections:
- local: quantization/overview
@ -198,6 +225,8 @@
- local: quantization/contribute
title: Contribute
title: Quantization
- isExpanded: false
sections:
- local: serialization
@ -209,8 +238,12 @@
- local: torchscript
title: TorchScript
title: Export to production
- isExpanded: false
sections:
- sections:
- sections:
- sections:
- sections:
- local: tasks/sequence_classification
@ -230,12 +263,16 @@
- local: tasks/multiple_choice
title: Multiple choice
title: Natural language processing
- sections:
- local: tasks/audio_classification
title: Audio classification
- local: tasks/asr
title: Automatic speech recognition
title: Audio
- sections:
- local: tasks/image_classification
title: Image classification
@ -262,6 +299,8 @@
- local: tasks/knowledge_distillation_for_image_classification
title: Knowledge Distillation for Computer Vision
title: Computer vision
- sections:
- local: tasks/image_captioning
title: Image captioning
@ -279,6 +318,8 @@
title: Video-text-to-text
title: Multimodal
title: Task recipes
- local: run_scripts
title: Training scripts
- local: glossary
@ -292,6 +333,8 @@
- local: troubleshooting
title: Troubleshoot
title: Resources
- isExpanded: false
sections:
- local: contributing
@ -301,8 +344,11 @@
- local: pr_checks
title: Pull request checks
title: Contribute
- isExpanded: false
sections:
- sections:
- sections:
- local: main_classes/agent
title: Agents and Tools
@ -351,6 +397,9 @@
- local: main_classes/image_processor
title: Image Processor
title: Main classes
- sections:
- sections:
- local: model_doc/albert
@ -664,6 +713,8 @@
- local: model_doc/zamba2
title: Zamba2
title: Text models
- sections:
- local: model_doc/beit
title: BEiT
@ -794,6 +845,8 @@
- local: model_doc/zoedepth
title: ZoeDepth
title: Vision models
- sections:
- local: model_doc/audio-spectrogram-transformer
title: Audio Spectrogram Transformer
@ -864,6 +917,8 @@
- local: model_doc/xlsr_wav2vec2
title: XLSR-Wav2Vec2
title: Audio models
- sections:
- local: model_doc/timesformer
title: TimeSformer
@ -873,6 +928,8 @@
title: ViViT
title: Video models
- sections:
- local: model_doc/aimv2
title: AIMv2
- local: model_doc/align
title: ALIGN
- local: model_doc/altclip
@ -1020,12 +1077,16 @@
- local: model_doc/xclip
title: X-CLIP
title: Multimodal models
- sections:
- local: model_doc/decision_transformer
title: Decision Transformer
- local: model_doc/trajectory_transformer
title: Trajectory Transformer
title: Reinforcement learning models
- sections:
- local: model_doc/autoformer
title: Autoformer
@ -1038,11 +1099,16 @@
- local: model_doc/time_series_transformer
title: Time Series Transformer
title: Time series models
- sections:
- local: model_doc/graphormer
title: Graphormer
title: Graph models
title: Models
- sections:
- local: internal/modeling_utils
title: Custom Layers and Utilities
@ -1066,3 +1132,6 @@
title: Utilities for Time Series
title: Internal helpers
title: API

View File

@ -0,0 +1,82 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# AIMv2
## Overview
The AIMv2 model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>
The abstract from the paper is the following:
*<INSERT PAPER ABSTRACT HERE>*
Tips:
<INSERT TIPS ABOUT MODEL HERE>
This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
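
This commit only adds the model skeleton and the class documentation below; no converted checkpoint is shipped yet. A minimal usage sketch with randomly initialized weights, assuming the modeling code follows the CLIP-style API that the conversion script in this commit targets (`logits_per_image`, `get_text_features`, `get_image_features`):

```python
import torch

from transformers import AIMv2Config, AIMv2Model

# Build a randomly initialized model from the default configuration added in this commit.
config = AIMv2Config()
model = AIMv2Model(config).eval()

# Dummy inputs sized to the configuration defaults: 77 text tokens, 224x224 RGB image.
input_ids = torch.randint(0, config.text_config.vocab_size, (1, config.text_config.max_position_embeddings))
pixel_values = torch.randn(
    1, config.vision_config.num_channels, config.vision_config.image_size, config.vision_config.image_size
)

with torch.no_grad():
    outputs = model(input_ids=input_ids, pixel_values=pixel_values)

# CLIP-style image-text similarity logits; shape (num_images, num_texts).
print(outputs.logits_per_image.shape)
```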
## AIMv2Config
[[autodoc]] AIMv2Config
- from_text_vision_configs
## AIMv2TextConfig
[[autodoc]] AIMv2TextConfig
## AIMv2VisionConfig
[[autodoc]] AIMv2VisionConfig
## AIMv2Model
[[autodoc]] AIMv2Model
- forward
- get_text_features
- get_image_features
## AIMv2TextModel
[[autodoc]] AIMv2TextModel
- forward
## AIMv2TextModelWithProjection
[[autodoc]] AIMv2TextModelWithProjection
- forward
## AIMv2VisionModelWithProjection
[[autodoc]] AIMv2VisionModelWithProjection
- forward
## AIMv2VisionModel
[[autodoc]] AIMv2VisionModel
- forward
## AIMv2ForImageClassification
[[autodoc]] AIMv2ForImageClassification
- forward

View File

@ -289,6 +289,11 @@ _import_structure = {
"CLIPTokenizer",
"CLIPVisionConfig",
],
"models.aimv2": [
"AIMv2Config",
"AIMv2TextConfig",
"AIMv2VisionConfig",
],
"models.clipseg": [
"CLIPSegConfig",
"CLIPSegProcessor",
@ -1852,6 +1857,17 @@ else:
"CLIPVisionModelWithProjection",
]
)
_import_structure["models.aimv2"].extend(
[
"AIMv2ForImageClassification",
"AIMv2Model",
"AIMv2PreTrainedModel",
"AIMv2TextModel",
"AIMv2TextModelWithProjection",
"AIMv2VisionModel",
"AIMv2VisionModelWithProjection",
]
)
_import_structure["models.clipseg"].extend(
[
"CLIPSegForImageSegmentation",
@ -5459,6 +5475,13 @@ if TYPE_CHECKING:
CLIPTokenizer,
CLIPVisionConfig,
)
from .models.aimv2 import (
AIMv2Config,
AIMv2TextConfig,
AIMv2VisionConfig,
)
from .models.clipseg import (
CLIPSegConfig,
CLIPSegProcessor,
@ -7010,6 +7033,15 @@ if TYPE_CHECKING:
CLIPVisionModel,
CLIPVisionModelWithProjection,
)
from .models.aimv2 import (
AIMv2ForImageClassification,
AIMv2Model,
AIMv2PreTrainedModel,
AIMv2TextModel,
AIMv2TextModelWithProjection,
AIMv2VisionModel,
AIMv2VisionModelWithProjection,
)
from .models.clipseg import (
CLIPSegForImageSegmentation,
CLIPSegModel,

View File

@ -49,6 +49,7 @@ from . import (
chinese_clip,
clap,
clip,
aimv2,
clipseg,
clvp,
code_llama,

View File

@ -0,0 +1,29 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_aimv2 import *
from .modeling_aimv2 import *
from .modeling_flax_aimv2 import *
from .modeling_tf_aimv2 import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
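
As a rough sketch of what this lazy wiring is meant to enable once the skeleton is importable (assuming the configuration and modeling modules define `__all__`, as `configuration_aimv2.py` below does), submodule attributes are only imported on first access:

```python
# Hypothetical smoke test for the lazy module set up above.
from transformers.models import aimv2

print(type(aimv2).__name__)    # _LazyModule: the placeholder installed in sys.modules
config = aimv2.AIMv2Config()   # first attribute access triggers the real import of configuration_aimv2
print(config.model_type)       # "aimv2"
```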

View File

@ -0,0 +1,422 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AIMv2 model configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional
if TYPE_CHECKING:
from ...processing_utils import ProcessorMixin
from ...utils import TensorType
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
class AIMv2TextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`AIMv2TextModel`]. It is used to instantiate an AIMv2
text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 49408):
Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`AIMv2Model`].
hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
max_position_embeddings (`int`, *optional*, defaults to 77):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 49406):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 49407):
End of stream token id.
Example:
```python
>>> from transformers import AIMv2TextConfig, AIMv2TextModel
>>> # Initializing an AIMv2TextConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = AIMv2TextConfig()
>>> # Initializing an AIMv2TextModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = AIMv2TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "aimv2_text_model"
base_config_key = "text_config"
def __init__(
self,
vocab_size=49408,
hidden_size=512,
intermediate_size=2048,
projection_dim=512,
num_hidden_layers=12,
num_attention_heads=8,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
# This differs from `CLIPTokenizer`'s default and from openai/clip
# See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
pad_token_id=1,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
class AIMv2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`AIMv2VisionModel`]. It is used to instantiate an
AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 32):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
Example:
```python
>>> from transformers import AIMv2VisionConfig, AIMv2VisionModel
>>> # Initializing an AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = AIMv2VisionConfig()
>>> # Initializing an AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = AIMv2VisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "aimv2_vision_model"
base_config_key = "vision_config"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
projection_dim=512,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
class AIMv2Config(PretrainedConfig):
r"""
[`AIMv2Config`] is the configuration class to store the configuration of an [`AIMv2Model`]. It is used to instantiate
an AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating
a configuration with the defaults will yield a similar configuration to that of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`AIMv2TextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`AIMv2VisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. Default is used as per the original AIMv2 implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```python
>>> from transformers import AIMv2Config, AIMv2Model
>>> # Initializing an AIMv2Config with apple/aimv2-large-patch14-224 style configuration
>>> configuration = AIMv2Config()
>>> # Initializing an AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = AIMv2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # We can also initialize an AIMv2Config from an AIMv2TextConfig and an AIMv2VisionConfig
>>> from transformers import AIMv2TextConfig, AIMv2VisionConfig
>>> # Initializing an AIMv2Text and AIMv2Vision configuration
>>> config_text = AIMv2TextConfig()
>>> config_vision = AIMv2VisionConfig()
>>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision)
```"""
model_type = "aimv2"
sub_configs = {"text_config": AIMv2TextConfig, "vision_config": AIMv2VisionConfig}
def __init__(
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
):
# If `_config_dict` exist, we use them for the backward compatibility.
# We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
# of confusion!).
text_config_dict = kwargs.pop("text_config_dict", None)
vision_config_dict = kwargs.pop("vision_config_dict", None)
super().__init__(**kwargs)
# Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
# `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
# cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
if text_config_dict is not None:
if text_config is None:
text_config = {}
# This is the complete result when using `text_config_dict`.
_text_config_dict = AIMv2TextConfig(**text_config_dict).to_dict()
# Give a warning if the values exist in both `_text_config_dict` and `text_config` but are different.
for key, value in _text_config_dict.items():
if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
# If specified in `text_config_dict`
if key in text_config_dict:
message = (
f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
f'The value `text_config_dict["{key}"]` will be used instead.'
)
# If inferred from default argument values (just to be super careful)
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `AIMv2TextConfig`. The "
f'value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
# Update all values in `text_config` with the ones in `_text_config_dict`.
text_config.update(_text_config_dict)
if vision_config_dict is not None:
if vision_config is None:
vision_config = {}
# This is the complete result when using `vision_config_dict`.
_vision_config_dict = AIMv2VisionConfig(**vision_config_dict).to_dict()
# convert keys to string instead of integer
if "id2label" in _vision_config_dict:
_vision_config_dict["id2label"] = {
str(key): value for key, value in _vision_config_dict["id2label"].items()
}
# Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but are different.
for key, value in _vision_config_dict.items():
if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
# If specified in `vision_config_dict`
if key in vision_config_dict:
message = (
f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
f'values. The value `vision_config_dict["{key}"]` will be used instead.'
)
# If inferred from default argument values (just to be super careful)
else:
message = (
f"`vision_config_dict` is provided which will be used to initialize `AIMv2VisionConfig`. "
f'The value `vision_config["{key}"]` will be overridden.'
)
logger.info(message)
# Update all values in `vision_config` with the ones in `_vision_config_dict`.
vision_config.update(_vision_config_dict)
if text_config is None:
text_config = {}
logger.info("`text_config` is `None`. Initializing the `AIMv2TextConfig` with default values.")
if vision_config is None:
vision_config = {}
logger.info("`vision_config` is `None`. initializing the `AIMv2VisionConfig` with default values.")
self.text_config = AIMv2TextConfig(**text_config)
self.vision_config = AIMv2VisionConfig(**vision_config)
self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.initializer_factor = 1.0
@classmethod
def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs):
r"""
Instantiate an [`AIMv2Config`] (or a derived class) from AIMv2 text model configuration and AIMv2 vision model
configuration.
Returns:
[`AIMv2Config`]: An instance of a configuration object
"""
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
class AIMv2OnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
("attention_mask", {0: "batch", 1: "sequence"}),
]
)
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("logits_per_image", {0: "batch"}),
("logits_per_text", {0: "batch"}),
("text_embeds", {0: "batch"}),
("image_embeds", {0: "batch"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
def generate_dummy_inputs(
self,
processor: "ProcessorMixin",
batch_size: int = -1,
seq_length: int = -1,
framework: Optional["TensorType"] = None,
) -> Mapping[str, Any]:
text_input_dict = super().generate_dummy_inputs(
processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
)
image_input_dict = super().generate_dummy_inputs(
processor.image_processor, batch_size=batch_size, framework=framework
)
return {**text_input_dict, **image_input_dict}
@property
def default_onnx_opset(self) -> int:
return 14
__all__ = ["AIMv2Config", "AIMv2OnnxConfig", "AIMv2TextConfig", "AIMv2VisionConfig"]

View File

@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
from aimv2 import load
from transformers import AIMv2Config, AIMv2Model
def copy_attn_layer(hf_attn_layer, pt_attn_layer):
q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)
out_proj_weights = pt_attn_layer.out_proj.weight
out_proj_bias = pt_attn_layer.out_proj.bias
hf_attn_layer.q_proj.weight.data = q_proj
hf_attn_layer.q_proj.bias.data = q_proj_bias
hf_attn_layer.k_proj.weight.data = k_proj
hf_attn_layer.k_proj.bias.data = k_proj_bias
hf_attn_layer.v_proj.weight.data = v_proj
hf_attn_layer.v_proj.bias.data = v_proj_bias
hf_attn_layer.out_proj.weight = out_proj_weights
hf_attn_layer.out_proj.bias = out_proj_bias
def copy_mlp(hf_mlp, pt_mlp):
copy_linear(hf_mlp.fc1, pt_mlp.c_fc)
copy_linear(hf_mlp.fc2, pt_mlp.c_proj)
def copy_linear(hf_linear, pt_linear):
hf_linear.weight = pt_linear.weight
hf_linear.bias = pt_linear.bias
def copy_layer(hf_layer, pt_layer):
# copy layer norms
copy_linear(hf_layer.layer_norm1, pt_layer.ln_1)
copy_linear(hf_layer.layer_norm2, pt_layer.ln_2)
# copy MLP
copy_mlp(hf_layer.mlp, pt_layer.mlp)
# copy attn
copy_attn_layer(hf_layer.self_attn, pt_layer.attn)
def copy_layers(hf_layers, pt_layers):
for hf_layer, pt_layer in zip(hf_layers, pt_layers):
copy_layer(hf_layer, pt_layer)
def copy_encoder(hf_encoder, pt_model):
# copy embeds
hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight
hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding
# copy layer norm
copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)
# copy hidden layers
copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks)
def copy_text_model_and_projection(hf_model, pt_model):
# copy projection
hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous()
# copy text encoder
copy_encoder(hf_model.text_model, pt_model)
def copy_vision_model_and_projection(hf_model, pt_model):
# copy projection
hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous()
# copy layer norms
copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre)
copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post)
# copy embeds
hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data
hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding
hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data
# copy encoder
copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks)
@torch.no_grad()
def convert_aimv2_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = AIMv2Config.from_pretrained(config_path)
else:
config = AIMv2Config(projection_dim=512, text_config={}, vision_config={})
hf_model = AIMv2Model(config).eval()
pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
pt_model = pt_model.eval()
copy_text_model_and_projection(hf_model, pt_model)
copy_vision_model_and_projection(hf_model, pt_model)
hf_model.logit_scale = pt_model.logit_scale
# Use `eos_token` so the example is more meaningful
input_ids = torch.tensor(
[
[config.text_config.bos_token_id]
+ list(range(3, 77))
+ [config.text_config.eos_token_id]
+ [config.text_config.pad_token_id]
]
)
pixel_values = torch.randn(1, 3, 224, 224)
hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
hf_logits_per_image = hf_outputs.logits_per_image
hf_logits_per_text = hf_outputs.logits_per_text
pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)
assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3)
assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
args = parser.parse_args()
convert_aimv2_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
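
Assuming the conversion above has been run against an original checkpoint (the `aimv2` package providing `load` is an external dependency), the dumped folder loads back through the regular API; the path below is a placeholder:

```python
from transformers import AIMv2Config, AIMv2Model

# "./aimv2-hf" is a placeholder for the folder written by convert_aimv2_checkpoint via save_pretrained.
model = AIMv2Model.from_pretrained("./aimv2-hf")

assert isinstance(model.config, AIMv2Config)
print(model.config.projection_dim)
```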

View File

@ -64,6 +64,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionConfig"),
("clap", "ClapConfig"),
("clip", "CLIPConfig"),
("aimv2", "AIMv2Config"),
("clip_text_model", "CLIPTextConfig"),
("clip_vision_model", "CLIPVisionConfig"),
("clipseg", "CLIPSegConfig"),
@ -396,6 +397,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
("clap", "CLAP"),
("clip", "CLIP"),
("aimv2", "AIMv2"),
("clip_text_model", "CLIPTextModel"),
("clip_vision_model", "CLIPVisionModel"),
("clipseg", "CLIPSeg"),

View File

@ -44,6 +44,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
("chinese_clip", "ChineseCLIPFeatureExtractor"),
("clap", "ClapFeatureExtractor"),
("clip", "CLIPFeatureExtractor"),
("aimv2", "AIMv2FeatureExtractor"),
("clipseg", "ViTFeatureExtractor"),
("clvp", "ClvpFeatureExtractor"),
("conditional_detr", "ConditionalDetrFeatureExtractor"),

View File

@ -64,6 +64,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
("clap", "ClapModel"),
("clip", "CLIPModel"),
("aimv2", "AIMv2Model"),
("clip_text_model", "CLIPTextModel"),
("clip_vision_model", "CLIPVisionModel"),
("clipseg", "CLIPSegModel"),
@ -681,6 +682,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("beit", "BeitForImageClassification"),
("bit", "BitForImageClassification"),
("clip", "CLIPForImageClassification"),
("aimv2", "AIMv2ForImageClassification"),
("convnext", "ConvNextForImageClassification"),
("convnextv2", "ConvNextV2ForImageClassification"),
("cvt", "CvtForImageClassification"),
@ -1416,6 +1418,7 @@ MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("blip-2", "Blip2ForImageTextRetrieval"),
("chinese_clip", "ChineseCLIPModel"),
("clip", "CLIPModel"),
("aimv2", "AIMv2Model"),
("clipseg", "CLIPSegModel"),
("siglip", "SiglipModel"),
("siglip2", "Siglip2Model"),

View File

@ -57,6 +57,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("chinese_clip", "ChineseCLIPProcessor"),
("clap", "ClapProcessor"),
("clip", "CLIPProcessor"),
("aimv2", "AIMv2Processor"),
("clipseg", "CLIPSegProcessor"),
("clvp", "ClvpProcessor"),
("colpali", "ColPaliProcessor"),

View File

@ -131,6 +131,13 @@ else:
"CLIPTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"aimv2",
(
"CLIPTokenizer",
"CLIPTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"clipseg",
(
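
Because the slow/fast tokenizer slots for `aimv2` point at the existing CLIP tokenizer classes, no new tokenizer is added in this commit; a hedged sketch of what that reuse implies (the CLIP checkpoint name here is only used to obtain a compatible vocabulary):

```python
from transformers import CLIPTokenizerFast

# AIMv2 reuses CLIP's tokenizer classes per the mapping above.
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
batch = tokenizer("a photo of a cat", padding="max_length", max_length=77, return_tensors="pt")
print(batch.input_ids.shape)  # (1, 77), matching AIMv2TextConfig.max_position_embeddings
```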

View File

File diff suppressed because it is too large