Model skeleton

yaswanth 2025-03-07 13:59:06 +05:30 committed by yaswant19
parent c9d1e5238a
commit 8198d49871
15 changed files with 1945 additions and 0 deletions

View File

@ -1,3 +1,4 @@
- sections:
- local: index
title: Transformers
@ -6,8 +7,11 @@
- local: quicktour
title: Quickstart
title: Get started
- isExpanded: false
sections:
- sections:
- sections:
- local: models
title: Loading models
@ -30,6 +34,8 @@
- local: attention
title: Attention mechanisms
title: Models
- sections:
- local: fast_tokenizers
title: Tokenizers
@ -47,8 +53,12 @@
title: Padding and truncation
title: Preprocessors
title: Base classes
- isExpanded: false
sections:
- sections:
- sections:
- local: pipeline_tutorial
title: Pipeline
@ -59,6 +69,8 @@
- local: add_new_pipeline
title: Adding a new pipeline
title: Pipeline API
- sections:
- local: llm_tutorial
title: Text generation
@ -81,6 +93,8 @@
- local: perplexity
title: Perplexity of fixed-length models
title: LLMs
- sections:
- local: conversations
title: Chat basics
@ -93,6 +107,8 @@
- local: chat_extras
title: Tools and RAG
title: Chat with models
- sections:
- local: perf_torch_compile
title: torch.compile
@ -105,13 +121,17 @@
- local: tf_xla
title: XLA
title: Optimization
- local: agents
title: Agents
- local: tools
title: Tools
title: Inference
- isExpanded: false
sections:
- sections:
- sections:
- local: trainer
title: Trainer
@ -122,6 +142,8 @@
- local: hpo_train
title: Hyperparameter search
title: Trainer API
- sections:
- local: gpu_selection
title: GPU selection
@ -138,6 +160,8 @@
- local: perf_train_gpu_many
title: Parallelism methods
title: Distributed training
- sections:
- local: perf_train_gpu_one
title: GPU
@ -150,11 +174,14 @@
- local: perf_hardware
title: Build your own machine
title: Hardware
- local: peft
title: PEFT
- local: model_memory_anatomy
title: Model training anatomy
title: Training
- isExpanded: false
sections:
- local: quantization/overview
@ -198,6 +225,8 @@
- local: quantization/contribute
title: Contribute
title: Quantization
- isExpanded: false
sections:
- local: serialization
@ -209,8 +238,12 @@
- local: torchscript
title: TorchScript
title: Export to production
- isExpanded: false
sections:
- sections:
- sections:
- sections:
- sections:
- local: tasks/sequence_classification
@ -230,12 +263,16 @@
- local: tasks/multiple_choice
title: Multiple choice
title: Natural language processing
- sections:
- local: tasks/audio_classification
title: Audio classification
- local: tasks/asr
title: Automatic speech recognition
title: Audio
- sections:
- local: tasks/image_classification
title: Image classification
@ -262,6 +299,8 @@
- local: tasks/knowledge_distillation_for_image_classification
title: Knowledge Distillation for Computer Vision
title: Computer vision
- sections:
- local: tasks/image_captioning
title: Image captioning
@ -279,6 +318,8 @@
title: Video-text-to-text
title: Multimodal
title: Task recipes
- local: run_scripts
title: Training scripts
- local: glossary
@ -292,6 +333,8 @@
- local: troubleshooting
title: Troubleshoot
title: Resources
- isExpanded: false
sections:
- local: contributing
@ -301,8 +344,11 @@
- local: pr_checks
title: Pull request checks
title: Contribute
- isExpanded: false
sections:
- sections:
- sections:
- local: main_classes/agent
title: Agents and Tools
@ -351,6 +397,9 @@
- local: main_classes/image_processor
title: Image Processor
title: Main classes
- sections:
- sections:
- local: model_doc/albert
@ -664,6 +713,8 @@
- local: model_doc/zamba2
title: Zamba2
title: Text models
- sections:
- local: model_doc/beit
title: BEiT
@ -794,6 +845,8 @@
- local: model_doc/zoedepth
title: ZoeDepth
title: Vision models
- sections:
- local: model_doc/audio-spectrogram-transformer
title: Audio Spectrogram Transformer
@ -864,6 +917,8 @@
- local: model_doc/xlsr_wav2vec2
title: XLSR-Wav2Vec2
title: Audio models
- sections:
- local: model_doc/timesformer
title: TimeSformer
@ -873,6 +928,8 @@
title: ViViT
title: Video models
- sections:
- local: model_doc/aimv2
title: AIMv2
- local: model_doc/align
title: ALIGN
- local: model_doc/altclip
@ -1020,12 +1077,16 @@
- local: model_doc/xclip
title: X-CLIP
title: Multimodal models
- sections:
- local: model_doc/decision_transformer
title: Decision Transformer
- local: model_doc/trajectory_transformer
title: Trajectory Transformer
title: Reinforcement learning models
- sections:
- local: model_doc/autoformer
title: Autoformer
@ -1038,11 +1099,16 @@
- local: model_doc/time_series_transformer
title: Time Series Transformer
title: Time series models
- sections:
- local: model_doc/graphormer
title: Graphormer
title: Graph models
title: Models
- sections:
- local: internal/modeling_utils
title: Custom Layers and Utilities
@ -1066,3 +1132,6 @@
title: Utilities for Time Series
title: Internal helpers
title: API

View File

@ -0,0 +1,82 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# AIMv2
## Overview
The AIMv2 model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>
The abstract from the paper is the following:
*<INSERT PAPER ABSTRACT HERE>*
Tips:
<INSERT TIPS ABOUT MODEL HERE>
This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
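
This commit only adds the model skeleton and the class documentation below; no converted checkpoint is shipped yet. A minimal usage sketch with randomly initialized weights, assuming the modeling code follows the CLIP-style API that the conversion script in this commit targets (`logits_per_image`, `get_text_features`, `get_image_features`):

```python
import torch

from transformers import AIMv2Config, AIMv2Model

# Build a randomly initialized model from the default configuration added in this commit.
config = AIMv2Config()
model = AIMv2Model(config).eval()

# Dummy inputs sized to the configuration defaults: 77 text tokens, 224x224 RGB image.
input_ids = torch.randint(0, config.text_config.vocab_size, (1, config.text_config.max_position_embeddings))
pixel_values = torch.randn(
    1, config.vision_config.num_channels, config.vision_config.image_size, config.vision_config.image_size
)

with torch.no_grad():
    outputs = model(input_ids=input_ids, pixel_values=pixel_values)

# CLIP-style image-text similarity logits; shape (num_images, num_texts).
print(outputs.logits_per_image.shape)
```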
## AIMv2Config
[[autodoc]] AIMv2Config
- from_text_vision_configs
## AIMv2TextConfig
[[autodoc]] AIMv2TextConfig
## AIMv2VisionConfig
[[autodoc]] AIMv2VisionConfig
## AIMv2Model
[[autodoc]] AIMv2Model
- forward
- get_text_features
- get_image_features
## AIMv2TextModel
[[autodoc]] AIMv2TextModel
- forward
## AIMv2TextModelWithProjection
[[autodoc]] AIMv2TextModelWithProjection
- forward
## AIMv2VisionModelWithProjection
[[autodoc]] AIMv2VisionModelWithProjection
- forward
## AIMv2VisionModel
[[autodoc]] AIMv2VisionModel
- forward
## AIMv2ForImageClassification
[[autodoc]] AIMv2ForImageClassification
- forward

View File

@ -289,6 +289,11 @@ _import_structure = {
"CLIPTokenizer",
"CLIPVisionConfig",
],
"models.aimv2": [
"AIMv2Config",
"AIMv2TextConfig",
"AIMv2VisionConfig",
],
"models.clipseg": [
"CLIPSegConfig",
"CLIPSegProcessor",
@ -1852,6 +1857,17 @@ else:
"CLIPVisionModelWithProjection",
]
)
_import_structure["models.aimv2"].extend(
[
"AIMv2ForImageClassification",
"AIMv2Model",
"AIMv2PreTrainedModel",
"AIMv2TextModel",
"AIMv2TextModelWithProjection",
"AIMv2VisionModel",
"AIMv2VisionModelWithProjection",
]
)
_import_structure["models.clipseg"].extend(
[
"CLIPSegForImageSegmentation",
@ -5459,6 +5475,13 @@ if TYPE_CHECKING:
CLIPTokenizer,
CLIPVisionConfig,
)
from .models.aimv2 import (
AIMv2Config,
AIMv2TextConfig,
AIMv2VisionConfig,
)
from .models.clipseg import (
CLIPSegConfig,
CLIPSegProcessor,
@ -7010,6 +7033,15 @@ if TYPE_CHECKING:
CLIPVisionModel,
CLIPVisionModelWithProjection,
)
from .models.aimv2 import (
AIMv2ForImageClassification,
AIMv2Model,
AIMv2PreTrainedModel,
AIMv2TextModel,
AIMv2TextModelWithProjection,
AIMv2VisionModel,
AIMv2VisionModelWithProjection,
)
from .models.clipseg import (
CLIPSegForImageSegmentation,
CLIPSegModel,

View File

@ -49,6 +49,7 @@ from . import (
chinese_clip,
clap,
clip,
aimv2,
clipseg,
clvp,
code_llama,

View File

@ -0,0 +1,29 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_aimv2 import *
from .modeling_aimv2 import *
from .modeling_flax_aimv2 import *
from .modeling_tf_aimv2 import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
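
As a rough sketch of what this lazy wiring is meant to enable once the skeleton is importable (assuming the configuration and modeling modules define `__all__`, as `configuration_aimv2.py` below does), submodule attributes are only imported on first access:

```python
# Hypothetical smoke test for the lazy module set up above.
from transformers.models import aimv2

print(type(aimv2).__name__)    # _LazyModule: the placeholder installed in sys.modules
config = aimv2.AIMv2Config()   # first attribute access triggers the real import of configuration_aimv2
print(config.model_type)       # "aimv2"
```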

View File

@ -0,0 +1,422 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AIMv2 model configuration"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional
if TYPE_CHECKING:
from ...processing_utils import ProcessorMixin
from ...utils import TensorType
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
class AIMv2TextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`AIMv2TextModel`]. It is used to instantiate an AIMv2
text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 49408):
Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
the `inputs_ids` passed when calling [`AIMv2Model`].
hidden_size (`int`, *optional*, defaults to 512):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
max_position_embeddings (`int`, *optional*, defaults to 77):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 49406):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 49407):
End of stream token id.
Example:
```python
>>> from transformers import AIMv2TextConfig, AIMv2TextModel
>>> # Initializing an AIMv2TextConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = AIMv2TextConfig()
>>> # Initializing an AIMv2TextModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = AIMv2TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "aimv2_text_model"
base_config_key = "text_config"
def __init__(
self,
vocab_size=49408,
hidden_size=512,
intermediate_size=2048,
projection_dim=512,
num_hidden_layers=12,
num_attention_heads=8,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
# This differs from `CLIPTokenizer`'s default and from openai/clip
# See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
pad_token_id=1,
bos_token_id=49406,
eos_token_id=49407,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.max_position_embeddings = max_position_embeddings
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
class AIMv2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of an [`AIMv2VisionModel`]. It is used to instantiate an
AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 32):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
Example:
```python
>>> from transformers import AIMv2VisionConfig, AIMv2VisionModel
>>> # Initializing an AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = AIMv2VisionConfig()
>>> # Initializing an AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = AIMv2VisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "aimv2_vision_model"
base_config_key = "vision_config"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
projection_dim=512,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=1e-5,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
class AIMv2Config(PretrainedConfig):
r"""
[`AIMv2Config`] is the configuration class to store the configuration of an [`AIMv2Model`]. It is used to instantiate
an AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating
a configuration with the defaults will yield a similar configuration to that of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`AIMv2TextConfig`].
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`AIMv2VisionConfig`].
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
The initial value of the *logit_scale* parameter. Default is used as per the original AIMv2 implementation.
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```python
>>> from transformers import AIMv2Config, AIMv2Model
>>> # Initializing an AIMv2Config with apple/aimv2-large-patch14-224 style configuration
>>> configuration = AIMv2Config()
>>> # Initializing an AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = AIMv2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
>>> # We can also initialize an AIMv2Config from an AIMv2TextConfig and an AIMv2VisionConfig
>>> from transformers import AIMv2TextConfig, AIMv2VisionConfig
>>> # Initializing an AIMv2Text and AIMv2Vision configuration
>>> config_text = AIMv2TextConfig()
>>> config_vision = AIMv2VisionConfig()
>>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision)
```"""
model_type = "aimv2"
sub_configs = {"text_config": AIMv2TextConfig, "vision_config": AIMv2VisionConfig}
def __init__(
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
):
# If `_config_dict` exist, we use them for the backward compatibility.
# We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
# of confusion!).
text_config_dict = kwargs.pop("text_config_dict", None)
vision_config_dict = kwargs.pop("vision_config_dict", None)
super().__init__(**kwargs)
# Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
# `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
# cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
if text_config_dict is not None:
if text_config is None:
text_config = {}
# This is the complete result when using `text_config_dict`.
_text_config_dict = AIMv2TextConfig(**text_config_dict).to_dict()
# Give a warning if the values exist in both `_text_config_dict` and `text_config` but are different.
for key, value in _text_config_dict.items():
if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
# If specified in `text_config_dict`
if key in text_config_dict:
message = (
f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
f'The value `text_config_dict["{key}"]` will be used instead.'
)
# If inferred from default argument values (just to be super careful)
else:
message = (
f"`text_config_dict` is provided which will be used to initialize `AIMv2TextConfig`. The "
f'value `text_config["{key}"]` will be overridden.'
)
logger.info(message)
# Update all values in `text_config` with the ones in `_text_config_dict`.
text_config.update(_text_config_dict)
if vision_config_dict is not None:
if vision_config is None:
vision_config = {}
# This is the complete result when using `vision_config_dict`.
_vision_config_dict = AIMv2VisionConfig(**vision_config_dict).to_dict()
# convert keys to string instead of integer
if "id2label" in _vision_config_dict:
_vision_config_dict["id2label"] = {
str(key): value for key, value in _vision_config_dict["id2label"].items()
}
# Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but are different.
for key, value in _vision_config_dict.items():
if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
# If specified in `vision_config_dict`
if key in vision_config_dict:
message = (
f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
f'values. The value `vision_config_dict["{key}"]` will be used instead.'
)
# If inferred from default argument values (just to be super careful)
else:
message = (
f"`vision_config_dict` is provided which will be used to initialize `AIMv2VisionConfig`. "
f'The value `vision_config["{key}"]` will be overridden.'
)
logger.info(message)
# Update all values in `vision_config` with the ones in `_vision_config_dict`.
vision_config.update(_vision_config_dict)
if text_config is None:
text_config = {}
logger.info("`text_config` is `None`. Initializing the `AIMv2TextConfig` with default values.")
if vision_config is None:
vision_config = {}
logger.info("`vision_config` is `None`. initializing the `AIMv2VisionConfig` with default values.")
self.text_config = AIMv2TextConfig(**text_config)
self.vision_config = AIMv2VisionConfig(**vision_config)
self.projection_dim = projection_dim
self.logit_scale_init_value = logit_scale_init_value
self.initializer_factor = 1.0
@classmethod
def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs):
r"""
Instantiate an [`AIMv2Config`] (or a derived class) from AIMv2 text model configuration and AIMv2 vision model
configuration.
Returns:
[`AIMv2Config`]: An instance of a configuration object
"""
return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
class AIMv2OnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("input_ids", {0: "batch", 1: "sequence"}),
("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
("attention_mask", {0: "batch", 1: "sequence"}),
]
)
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict(
[
("logits_per_image", {0: "batch"}),
("logits_per_text", {0: "batch"}),
("text_embeds", {0: "batch"}),
("image_embeds", {0: "batch"}),
]
)
@property
def atol_for_validation(self) -> float:
return 1e-4
def generate_dummy_inputs(
self,
processor: "ProcessorMixin",
batch_size: int = -1,
seq_length: int = -1,
framework: Optional["TensorType"] = None,
) -> Mapping[str, Any]:
text_input_dict = super().generate_dummy_inputs(
processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
)
image_input_dict = super().generate_dummy_inputs(
processor.image_processor, batch_size=batch_size, framework=framework
)
return {**text_input_dict, **image_input_dict}
@property
def default_onnx_opset(self) -> int:
return 14
__all__ = ["AIMv2Config", "AIMv2OnnxConfig", "AIMv2TextConfig", "AIMv2VisionConfig"]

View File

@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
from aimv2 import load
from transformers import AIMv2Config, AIMv2Model
def copy_attn_layer(hf_attn_layer, pt_attn_layer):
q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)
out_proj_weights = pt_attn_layer.out_proj.weight
out_proj_bias = pt_attn_layer.out_proj.bias
hf_attn_layer.q_proj.weight.data = q_proj
hf_attn_layer.q_proj.bias.data = q_proj_bias
hf_attn_layer.k_proj.weight.data = k_proj
hf_attn_layer.k_proj.bias.data = k_proj_bias
hf_attn_layer.v_proj.weight.data = v_proj
hf_attn_layer.v_proj.bias.data = v_proj_bias
hf_attn_layer.out_proj.weight = out_proj_weights
hf_attn_layer.out_proj.bias = out_proj_bias
def copy_mlp(hf_mlp, pt_mlp):
copy_linear(hf_mlp.fc1, pt_mlp.c_fc)
copy_linear(hf_mlp.fc2, pt_mlp.c_proj)
def copy_linear(hf_linear, pt_linear):
hf_linear.weight = pt_linear.weight
hf_linear.bias = pt_linear.bias
def copy_layer(hf_layer, pt_layer):
# copy layer norms
copy_linear(hf_layer.layer_norm1, pt_layer.ln_1)
copy_linear(hf_layer.layer_norm2, pt_layer.ln_2)
# copy MLP
copy_mlp(hf_layer.mlp, pt_layer.mlp)
# copy attn
copy_attn_layer(hf_layer.self_attn, pt_layer.attn)
def copy_layers(hf_layers, pt_layers):
for hf_layer, pt_layer in zip(hf_layers, pt_layers):
copy_layer(hf_layer, pt_layer)
def copy_encoder(hf_encoder, pt_model):
# copy embeds
hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight
hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding
# copy layer norm
copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)
# copy hidden layers
copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks)
def copy_text_model_and_projection(hf_model, pt_model):
# copy projection
hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous()
# copy text encoder
copy_encoder(hf_model.text_model, pt_model)
def copy_vision_model_and_projection(hf_model, pt_model):
# copy projection
hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous()
# copy layer norms
copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre)
copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post)
# copy embeds
hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data
hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding
hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data
# copy encoder
copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks)
@torch.no_grad()
def convert_aimv2_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
"""
Copy/paste/tweak model's weights to transformers design.
"""
if config_path is not None:
config = AIMv2Config.from_pretrained(config_path)
else:
config = AIMv2Config(projection_dim=512, text_config={}, vision_config={})
hf_model = AIMv2Model(config).eval()
pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
pt_model = pt_model.eval()
copy_text_model_and_projection(hf_model, pt_model)
copy_vision_model_and_projection(hf_model, pt_model)
hf_model.logit_scale = pt_model.logit_scale
# Use `eos_token` so the example is more meaningful
input_ids = torch.tensor(
[
[config.text_config.bos_token_id]
+ list(range(3, 77))
+ [config.text_config.eos_token_id]
+ [config.text_config.pad_token_id]
]
)
pixel_values = torch.randn(1, 3, 224, 224)
hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
hf_logits_per_image = hf_outputs.logits_per_image
hf_logits_per_text = hf_outputs.logits_per_text
pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)
assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3)
assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3)
hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
args = parser.parse_args()
convert_aimv2_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
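
Assuming the conversion above has been run against an original checkpoint (the `aimv2` package providing `load` is an external dependency), the dumped folder loads back through the regular API; the path below is a placeholder:

```python
from transformers import AIMv2Config, AIMv2Model

# "./aimv2-hf" is a placeholder for the folder written by convert_aimv2_checkpoint via save_pretrained.
model = AIMv2Model.from_pretrained("./aimv2-hf")

assert isinstance(model.config, AIMv2Config)
print(model.config.projection_dim)
```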

View File

@ -64,6 +64,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionConfig"),
("clap", "ClapConfig"),
("clip", "CLIPConfig"),
("aimv2", "AIMv2Config"),
("clip_text_model", "CLIPTextConfig"),
("clip_vision_model", "CLIPVisionConfig"),
("clipseg", "CLIPSegConfig"),
@ -396,6 +397,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
("clap", "CLAP"),
("clip", "CLIP"),
("aimv2", "AIMv2"),
("clip_text_model", "CLIPTextModel"),
("clip_vision_model", "CLIPVisionModel"),
("clipseg", "CLIPSeg"),

View File

@ -44,6 +44,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
("chinese_clip", "ChineseCLIPFeatureExtractor"),
("clap", "ClapFeatureExtractor"),
("clip", "CLIPFeatureExtractor"),
("aimv2", "AIMv2FeatureExtractor"),
("clipseg", "ViTFeatureExtractor"),
("clvp", "ClvpFeatureExtractor"),
("conditional_detr", "ConditionalDetrFeatureExtractor"),

View File

@ -64,6 +64,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
("clap", "ClapModel"),
("clip", "CLIPModel"),
("aimv2", "AIMv2Model"),
("clip_text_model", "CLIPTextModel"),
("clip_vision_model", "CLIPVisionModel"),
("clipseg", "CLIPSegModel"),
@ -681,6 +682,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("beit", "BeitForImageClassification"),
("bit", "BitForImageClassification"),
("clip", "CLIPForImageClassification"),
("aimv2", "AIMv2ForImageClassification"),
("convnext", "ConvNextForImageClassification"),
("convnextv2", "ConvNextV2ForImageClassification"),
("cvt", "CvtForImageClassification"),
@ -1416,6 +1418,7 @@ MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("blip-2", "Blip2ForImageTextRetrieval"),
("chinese_clip", "ChineseCLIPModel"),
("clip", "CLIPModel"),
("aimv2", "AIMv2Model"),
("clipseg", "CLIPSegModel"),
("siglip", "SiglipModel"),
("siglip2", "Siglip2Model"),

View File

@ -57,6 +57,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("chinese_clip", "ChineseCLIPProcessor"),
("clap", "ClapProcessor"),
("clip", "CLIPProcessor"),
("aimv2", "AIMv2Processor"),
("clipseg", "CLIPSegProcessor"),
("clvp", "ClvpProcessor"),
("colpali", "ColPaliProcessor"),

View File

@ -131,6 +131,13 @@ else:
"CLIPTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"aimv2",
(
"CLIPTokenizer",
"CLIPTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"clipseg",
(
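
Because the slow/fast tokenizer slots for `aimv2` point at the existing CLIP tokenizer classes, no new tokenizer is added in this commit; a hedged sketch of what that reuse implies (the CLIP checkpoint name here is only used to obtain a compatible vocabulary):

```python
from transformers import CLIPTokenizerFast

# AIMv2 reuses CLIP's tokenizer classes per the mapping above.
tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
batch = tokenizer("a photo of a cat", padding="max_length", max_length=77, return_tensors="pt")
print(batch.input_ids.shape)  # (1, 77), matching AIMv2TextConfig.max_position_embeddings
```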

View File

File diff suppressed because it is too large