Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-03 03:31:05 +06:00)

Model skeleton

Parent: c9d1e5238a
Commit: 8198d49871
@@ -1,3 +1,4 @@
- sections:
- sections:
- local: index
title: Transformers
@@ -6,8 +7,11 @@
- local: quicktour
title: Quickstart
title: Get started
- isExpanded: false
title: Get started
- isExpanded: false
sections:
- sections:
- sections:
- local: models
title: Loading models
@@ -30,6 +34,8 @@
- local: attention
title: Attention mechanisms
title: Models
- sections:
title: Models
- sections:
- local: fast_tokenizers
title: Tokenizers
@@ -47,8 +53,12 @@
title: Padding and truncation
title: Preprocessors
title: Base classes
- isExpanded: false
title: Preprocessors
title: Base classes
- isExpanded: false
sections:
- sections:
- sections:
- local: pipeline_tutorial
title: Pipeline
@@ -59,6 +69,8 @@
- local: add_new_pipeline
title: Adding a new pipeline
title: Pipeline API
- sections:
title: Pipeline API
- sections:
- local: llm_tutorial
title: Text generation
@@ -81,6 +93,8 @@
- local: perplexity
title: Perplexity of fixed-length models
title: LLMs
- sections:
title: LLMs
- sections:
- local: conversations
title: Chat basics
@@ -93,6 +107,8 @@
- local: chat_extras
title: Tools and RAG
title: Chat with models
- sections:
title: Chat with models
- sections:
- local: perf_torch_compile
title: torch.compile
@@ -105,13 +121,17 @@
- local: tf_xla
title: XLA
title: Optimization
title: Optimization
- local: agents
title: Agents
- local: tools
title: Tools
title: Inference
- isExpanded: false
title: Inference
- isExpanded: false
sections:
- sections:
- sections:
- local: trainer
title: Trainer
@@ -122,6 +142,8 @@
- local: hpo_train
title: Hyperparameter search
title: Trainer API
- sections:
title: Trainer API
- sections:
- local: gpu_selection
title: GPU selection
@@ -138,6 +160,8 @@
- local: perf_train_gpu_many
title: Parallelism methods
title: Distributed training
- sections:
title: Distributed training
- sections:
- local: perf_train_gpu_one
title: GPU
@@ -150,11 +174,14 @@
- local: perf_hardware
title: Build your own machine
title: Hardware
title: Hardware
- local: peft
title: PEFT
- local: model_memory_anatomy
title: Model training anatomy
title: Training
- isExpanded: false
title: Training
- isExpanded: false
sections:
- local: quantization/overview
@@ -198,6 +225,8 @@
- local: quantization/contribute
title: Contribute
title: Quantization
- isExpanded: false
title: Quantization
- isExpanded: false
sections:
- local: serialization
@@ -209,8 +238,12 @@
- local: torchscript
title: TorchScript
title: Export to production
- isExpanded: false
title: Export to production
- isExpanded: false
sections:
- sections:
- sections:
- sections:
- sections:
- local: tasks/sequence_classification
@@ -230,12 +263,16 @@
- local: tasks/multiple_choice
title: Multiple choice
title: Natural language processing
- sections:
title: Natural language processing
- sections:
- local: tasks/audio_classification
title: Audio classification
- local: tasks/asr
title: Automatic speech recognition
title: Audio
- sections:
title: Audio
- sections:
- local: tasks/image_classification
title: Image classification
@@ -262,6 +299,8 @@
- local: tasks/knowledge_distillation_for_image_classification
title: Knowledge Distillation for Computer Vision
title: Computer vision
- sections:
title: Computer vision
- sections:
- local: tasks/image_captioning
title: Image captioning
@@ -279,6 +318,8 @@
title: Video-text-to-text
title: Multimodal
title: Task recipes
title: Multimodal
title: Task recipes
- local: run_scripts
title: Training scripts
- local: glossary
@@ -292,6 +333,8 @@
- local: troubleshooting
title: Troubleshoot
title: Resources
- isExpanded: false
title: Resources
- isExpanded: false
sections:
- local: contributing
@@ -301,8 +344,11 @@
- local: pr_checks
title: Pull request checks
title: Contribute
- isExpanded: false
title: Contribute
- isExpanded: false
sections:
- sections:
- sections:
- local: main_classes/agent
title: Agents and Tools
@@ -351,6 +397,9 @@
- local: main_classes/image_processor
title: Image Processor
title: Main classes
- sections:
- sections:
title: Main classes
- sections:
- sections:
- local: model_doc/albert
@@ -664,6 +713,8 @@
- local: model_doc/zamba2
title: Zamba2
title: Text models
- sections:
title: Text models
- sections:
- local: model_doc/beit
title: BEiT
@@ -794,6 +845,8 @@
- local: model_doc/zoedepth
title: ZoeDepth
title: Vision models
- sections:
title: Vision models
- sections:
- local: model_doc/audio-spectrogram-transformer
title: Audio Spectrogram Transformer
@@ -864,6 +917,8 @@
- local: model_doc/xlsr_wav2vec2
title: XLSR-Wav2Vec2
title: Audio models
- sections:
title: Audio models
- sections:
- local: model_doc/timesformer
title: TimeSformer
@@ -873,6 +928,8 @@
title: ViViT
title: Video models
- sections:
- local: model_doc/aimv2
title: AIMv2
- local: model_doc/align
title: ALIGN
- local: model_doc/altclip
@@ -1020,12 +1077,16 @@
- local: model_doc/xclip
title: X-CLIP
title: Multimodal models
- sections:
title: Multimodal models
- sections:
- local: model_doc/decision_transformer
title: Decision Transformer
- local: model_doc/trajectory_transformer
title: Trajectory Transformer
title: Reinforcement learning models
- sections:
title: Reinforcement learning models
- sections:
- local: model_doc/autoformer
title: Autoformer
@@ -1038,11 +1099,16 @@
- local: model_doc/time_series_transformer
title: Time Series Transformer
title: Time series models
- sections:
title: Time series models
- sections:
- local: model_doc/graphormer
title: Graphormer
title: Graph models
title: Models
- sections:
title: Graph models
title: Models
- sections:
- local: internal/modeling_utils
title: Custom Layers and Utilities
@@ -1066,3 +1132,6 @@
title: Utilities for Time Series
title: Internal helpers
title: API

title: Internal helpers
title: API
docs/source/en/model_doc/aimv2.md (new file, 82 lines)
@@ -0,0 +1,82 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# AIMv2

## Overview

The AIMv2 model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>

The abstract from the paper is the following:

*<INSERT PAPER ABSTRACT HERE>*

Tips:

<INSERT TIPS ABOUT MODEL HERE>

This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).

## AIMv2Config

[[autodoc]] AIMv2Config
    - from_text_vision_configs

## AIMv2TextConfig

[[autodoc]] AIMv2TextConfig

## AIMv2VisionConfig

[[autodoc]] AIMv2VisionConfig

## AIMv2Model

[[autodoc]] AIMv2Model
    - forward
    - get_text_features
    - get_image_features

## AIMv2TextModel

[[autodoc]] AIMv2TextModel
    - forward

## AIMv2TextModelWithProjection

[[autodoc]] AIMv2TextModelWithProjection
    - forward

## AIMv2VisionModelWithProjection

[[autodoc]] AIMv2VisionModelWithProjection
    - forward

## AIMv2VisionModel

[[autodoc]] AIMv2VisionModel
    - forward

## AIMv2ForImageClassification

[[autodoc]] AIMv2ForImageClassification
    - forward
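The autodoc stubs above expose a CLIP-style API (`forward`, `get_text_features`, `get_image_features`), so usage should eventually follow the familiar dual-encoder pattern. A minimal sketch, assuming the modeling and processing code exists and that a checkpoint is published under the `apple/aimv2-large-patch14-224` name used in the configuration docstrings (both are assumptions; this commit only adds the skeleton):

```python
# Sketch only: the modeling and processing code is not part of this skeleton commit,
# so the checkpoint name and processor behaviour are assumptions based on the CLIP-style API.
import torch
from PIL import Image

from transformers import AIMv2Model, AIMv2Processor

model = AIMv2Model.from_pretrained("apple/aimv2-large-patch14-224")
processor = AIMv2Processor.from_pretrained("apple/aimv2-large-patch14-224")

image = Image.open("cat.png")
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    outputs = model(**inputs)

# Contrastive logits between the image and each text prompt, as in CLIP.
probs = outputs.logits_per_image.softmax(dim=-1)
print(probs)
```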
@@ -289,6 +289,11 @@ _import_structure = {
        "CLIPTokenizer",
        "CLIPVisionConfig",
    ],
    "models.aimv2": [
        "AIMv2Config",
        "AIMv2TextConfig",
        "AIMv2VisionConfig",
    ],
    "models.clipseg": [
        "CLIPSegConfig",
        "CLIPSegProcessor",
@@ -1852,6 +1857,17 @@ else:
            "CLIPVisionModelWithProjection",
        ]
    )
    _import_structure["models.aimv2"].extend(
        [
            "AIMv2ForImageClassification",
            "AIMv2Model",
            "AIMv2PreTrainedModel",
            "AIMv2TextModel",
            "AIMv2TextModelWithProjection",
            "AIMv2VisionModel",
            "AIMv2VisionModelWithProjection",
        ]
    )
    _import_structure["models.clipseg"].extend(
        [
            "CLIPSegForImageSegmentation",
@@ -5459,6 +5475,13 @@ if TYPE_CHECKING:
        CLIPTokenizer,
        CLIPVisionConfig,
    )
    from .models.aimv2 import (
        AIMv2Config,
        AIMv2TextConfig,
        AIMv2VisionConfig,
    )
    from .models.clipseg import (
        CLIPSegConfig,
        CLIPSegProcessor,
@@ -7010,6 +7033,15 @@ if TYPE_CHECKING:
        CLIPVisionModel,
        CLIPVisionModelWithProjection,
    )
    from .models.aimv2 import (
        AIMv2ForImageClassification,
        AIMv2Model,
        AIMv2PreTrainedModel,
        AIMv2TextModel,
        AIMv2TextModelWithProjection,
        AIMv2VisionModel,
        AIMv2VisionModelWithProjection,
    )
    from .models.clipseg import (
        CLIPSegForImageSegmentation,
        CLIPSegModel,
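With these entries in `_import_structure` and the `TYPE_CHECKING` branch, the new symbols become importable from the top-level `transformers` namespace. A quick smoke test that should already work on this branch, since it touches only the configuration classes (a sketch, not part of the commit):

```python
# Assumes this branch is installed in editable mode; only the configuration classes
# exist at this point, since modeling_aimv2.py is still an empty placeholder.
from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig

config = AIMv2Config()
print(type(config.text_config).__name__)    # AIMv2TextConfig
print(type(config.vision_config).__name__)  # AIMv2VisionConfig
```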
@@ -49,6 +49,7 @@ from . import (
    chinese_clip,
    clap,
    clip,
    aimv2,
    clipseg,
    clvp,
    code_llama,
src/transformers/models/aimv2/__init__.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_aimv2 import *
    from .modeling_aimv2 import *
    from .modeling_flax_aimv2 import *
    from .modeling_tf_aimv2 import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
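The package `__init__.py` relies on `_LazyModule` plus `define_import_structure` so that importing `transformers` does not eagerly import every model. As a rough illustration of the idea only (not the actual transformers implementation), a stripped-down lazy module could look like this:

```python
# Toy sketch of lazy attribute resolution; names and behaviour are simplified assumptions.
import importlib
import types


class LazyModule(types.ModuleType):
    """Resolve attributes to submodules on first access and cache the result."""

    def __init__(self, name, attr_to_submodule):
        super().__init__(name)
        # e.g. {"AIMv2Config": "configuration_aimv2"}
        self._attr_to_submodule = attr_to_submodule

    def __getattr__(self, attr):
        submodule = self._attr_to_submodule.get(attr)
        if submodule is None:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        module = importlib.import_module(f"{self.__name__}.{submodule}")
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups bypass __getattr__
        return value
```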
src/transformers/models/aimv2/configuration_aimv2.py (new file, 422 lines)
@@ -0,0 +1,422 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AIMv2 model configuration"""

from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional


if TYPE_CHECKING:
    from ...processing_utils import ProcessorMixin
    from ...utils import TensorType

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class AIMv2TextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AIMv2TextModel`]. It is used to instantiate a AIMv2
    text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`AIMv2Model`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 49406):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 49407):
            End of stream token id.

    Example:

    ```python
    >>> from transformers import AIMv2TextConfig, AIMv2TextModel

    >>> # Initializing a AIMv2TextConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = AIMv2TextConfig()

    >>> # Initializing a AIMv2TextModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = AIMv2TextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "aimv2_text_model"
    base_config_key = "text_config"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # This differs from `CLIPTokenizer`'s default and from openai/aimv2
        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout


class AIMv2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AIMv2VisionModel`]. It is used to instantiate a
    AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import AIMv2VisionConfig, AIMv2VisionModel

    >>> # Initializing a AIMv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = AIMv2VisionConfig()

    >>> # Initializing a AIMv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = AIMv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "aimv2_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class AIMv2Config(PretrainedConfig):
    r"""
    [`AIMv2Config`] is the configuration class to store the configuration of a [`AIMv2Model`]. It is used to instantiate
    a AIMv2 model according to the specified arguments, defining the text model and vision model configs. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`AIMv2TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`AIMv2VisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original AIMv2 implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import AIMv2Config, AIMv2Model

    >>> # Initializing a AIMv2Config with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = AIMv2Config()

    >>> # Initializing a AIMv2Model (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = AIMv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a AIMv2Config from a AIMv2TextConfig and a AIMv2VisionConfig
    >>> from transformers import AIMv2TextConfig, AIMv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = AIMv2TextConfig()
    >>> config_vision = AIMv2VisionConfig()

    >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "aimv2"
    sub_configs = {"text_config": AIMv2TextConfig, "vision_config": AIMv2VisionConfig}

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # If `_config_dict` exist, we use them for the backward compatibility.
        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
        # of confusion!).
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
        if text_config_dict is not None:
            if text_config is None:
                text_config = {}

            # This is the complete result when using `text_config_dict`.
            _text_config_dict = AIMv2TextConfig(**text_config_dict).to_dict()

            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
            for key, value in _text_config_dict.items():
                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
                    # If specified in `text_config_dict`
                    if key in text_config_dict:
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                            f'The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `AIMv2TextConfig`. The "
                            f'value `text_config["{key}"]` will be overridden.'
                        )
                    logger.info(message)

            # Update all values in `text_config` with the ones in `_text_config_dict`.
            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            if vision_config is None:
                vision_config = {}

            # This is the complete result when using `vision_config_dict`.
            _vision_config_dict = AIMv2VisionConfig(**vision_config_dict).to_dict()
            # convert keys to string instead of integer
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
            for key, value in _vision_config_dict.items():
                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
                    # If specified in `vision_config_dict`
                    if key in vision_config_dict:
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize `AIMv2VisionConfig`. "
                            f'The value `vision_config["{key}"]` will be overridden.'
                        )
                    logger.info(message)

            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `AIMv2TextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `AIMv2VisionConfig` with default values.")

        self.text_config = AIMv2TextConfig(**text_config)
        self.vision_config = AIMv2VisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: AIMv2TextConfig, vision_config: AIMv2VisionConfig, **kwargs):
        r"""
        Instantiate a [`AIMv2Config`] (or a derived class) from aimv2 text model configuration and aimv2 vision model
        configuration.

        Returns:
            [`AIMv2Config`]: An instance of a configuration object
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)


class AIMv2OnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),
                ("logits_per_text", {0: "batch"}),
                ("text_embeds", {0: "batch"}),
                ("image_embeds", {0: "batch"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_input_dict, **image_input_dict}

    @property
    def default_onnx_opset(self) -> int:
        return 14


__all__ = ["AIMv2Config", "AIMv2OnnxConfig", "AIMv2TextConfig", "AIMv2VisionConfig"]
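Since `AIMv2Config` composes the two sub-configs exactly the way `CLIPConfig` does, the usual compose/save/reload round trip should work with nothing but the configuration code added here. A sketch (the non-default `patch_size=14` is just an illustrative value):

```python
# Sketch: exercises only the configuration classes added in this commit.
from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig

text_config = AIMv2TextConfig(hidden_size=512, num_hidden_layers=12)
vision_config = AIMv2VisionConfig(hidden_size=768, patch_size=14, image_size=224)

config = AIMv2Config.from_text_vision_configs(text_config, vision_config, projection_dim=512)
config.save_pretrained("aimv2-skeleton-config")  # writes config.json
reloaded = AIMv2Config.from_pretrained("aimv2-skeleton-config")

assert reloaded.vision_config.patch_size == 14
assert reloaded.model_type == "aimv2"
```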
@@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import torch
from aimv2 import load

from transformers import AIMv2Config, AIMv2Model


def copy_attn_layer(hf_attn_layer, pt_attn_layer):
    q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0)
    q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0)

    out_proj_weights = pt_attn_layer.out_proj.weight
    out_proj_bias = pt_attn_layer.out_proj.bias

    hf_attn_layer.q_proj.weight.data = q_proj
    hf_attn_layer.q_proj.bias.data = q_proj_bias

    hf_attn_layer.k_proj.weight.data = k_proj
    hf_attn_layer.k_proj.bias.data = k_proj_bias

    hf_attn_layer.v_proj.weight.data = v_proj
    hf_attn_layer.v_proj.bias.data = v_proj_bias

    hf_attn_layer.out_proj.weight = out_proj_weights
    hf_attn_layer.out_proj.bias = out_proj_bias


def copy_mlp(hf_mlp, pt_mlp):
    copy_linear(hf_mlp.fc1, pt_mlp.c_fc)
    copy_linear(hf_mlp.fc2, pt_mlp.c_proj)


def copy_linear(hf_linear, pt_linear):
    hf_linear.weight = pt_linear.weight
    hf_linear.bias = pt_linear.bias


def copy_layer(hf_layer, pt_layer):
    # copy layer norms
    copy_linear(hf_layer.layer_norm1, pt_layer.ln_1)
    copy_linear(hf_layer.layer_norm2, pt_layer.ln_2)

    # copy MLP
    copy_mlp(hf_layer.mlp, pt_layer.mlp)

    # copy attn
    copy_attn_layer(hf_layer.self_attn, pt_layer.attn)


def copy_layers(hf_layers, pt_layers):
    for hf_layer, pt_layer in zip(hf_layers, pt_layers):
        copy_layer(hf_layer, pt_layer)


def copy_encoder(hf_encoder, pt_model):
    # copy embeds
    hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight
    hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding

    # copy layer norm
    copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final)

    # copy hidden layers
    copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks)


def copy_text_model_and_projection(hf_model, pt_model):
    # copy projection
    hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous()

    # copy text encoder
    copy_encoder(hf_model.text_model, pt_model)


def copy_vison_model_and_projection(hf_model, pt_model):
    # copy projection
    hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous()

    # copy layer norms
    copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre)
    copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post)

    # copy embeds
    hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data
    hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding
    hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data

    # copy encoder
    copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks)


@torch.no_grad()
def convert_aimv2_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = AIMv2Config.from_pretrained(config_path)
    else:
        config = AIMv2Config(projection_dim=512, text_config={}, vision_config={})

    hf_model = AIMv2Model(config).eval()

    pt_model, _ = load(checkpoint_path, device="cpu", jit=False)
    pt_model = pt_model.eval()

    copy_text_model_and_projection(hf_model, pt_model)
    copy_vison_model_and_projection(hf_model, pt_model)
    hf_model.logit_scale = pt_model.logit_scale

    # Use `eos_token` so the example is more meaningful
    input_ids = torch.tensor(
        [
            [config.text_config.bos_token_id]
            + list(range(3, 77))
            + [config.text_config.eos_token_id]
            + [config.text_config.pad_token_id]
        ]
    )
    pixel_values = torch.randn(1, 3, 224, 224)

    hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
    hf_logits_per_image = hf_outputs.logits_per_image
    hf_logits_per_text = hf_outputs.logits_per_text
    pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids)

    assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3)
    assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3)

    hf_model.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint")
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    args = parser.parse_args()

    convert_aimv2_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
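After the modeling code lands, the script would be invoked with `--checkpoint_path`, `--pytorch_dump_folder_path` and optionally `--config_path`, and the dumped folder loads back like any local checkpoint. A hedged follow-up sketch (the output path and the existence of converted `AIMv2Model` weights are assumptions):

```python
# Assumes the conversion script above has already written a checkpoint to ./aimv2-converted
# and that AIMv2Model is implemented; this commit only adds the skeleton.
from transformers import AIMv2Model

model = AIMv2Model.from_pretrained("./aimv2-converted")
print(sum(p.numel() for p in model.parameters()), "parameters loaded")
```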
src/transformers/models/aimv2/modeling_aimv2.py (new empty file)
@@ -64,6 +64,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("chinese_clip_vision_model", "ChineseCLIPVisionConfig"),
        ("clap", "ClapConfig"),
        ("clip", "CLIPConfig"),
        ("aimv2", "AIMv2Config"),
        ("clip_text_model", "CLIPTextConfig"),
        ("clip_vision_model", "CLIPVisionConfig"),
        ("clipseg", "CLIPSegConfig"),
@@ -396,6 +397,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
        ("clap", "CLAP"),
        ("clip", "CLIP"),
        ("aimv2", "AIMv2"),
        ("clip_text_model", "CLIPTextModel"),
        ("clip_vision_model", "CLIPVisionModel"),
        ("clipseg", "CLIPSeg"),

@@ -44,6 +44,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
        ("chinese_clip", "ChineseCLIPFeatureExtractor"),
        ("clap", "ClapFeatureExtractor"),
        ("clip", "CLIPFeatureExtractor"),
        ("aimv2", "AIMv2FeatureExtractor"),
        ("clipseg", "ViTFeatureExtractor"),
        ("clvp", "ClvpFeatureExtractor"),
        ("conditional_detr", "ConditionalDetrFeatureExtractor"),

@@ -64,6 +64,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("chinese_clip_vision_model", "ChineseCLIPVisionModel"),
        ("clap", "ClapModel"),
        ("clip", "CLIPModel"),
        ("aimv2", "AIMv2Model"),
        ("clip_text_model", "CLIPTextModel"),
        ("clip_vision_model", "CLIPVisionModel"),
        ("clipseg", "CLIPSegModel"),
@@ -681,6 +682,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("beit", "BeitForImageClassification"),
        ("bit", "BitForImageClassification"),
        ("clip", "CLIPForImageClassification"),
        ("aimv2", "AIMv2ForImageClassification"),
        ("convnext", "ConvNextForImageClassification"),
        ("convnextv2", "ConvNextV2ForImageClassification"),
        ("cvt", "CvtForImageClassification"),
@@ -1416,6 +1418,7 @@ MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("blip-2", "Blip2ForImageTextRetrieval"),
        ("chinese_clip", "ChineseCLIPModel"),
        ("clip", "CLIPModel"),
        ("aimv2", "AIMv2Model"),
        ("clipseg", "CLIPSegModel"),
        ("siglip", "SiglipModel"),
        ("siglip2", "Siglip2Model"),

@@ -57,6 +57,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("chinese_clip", "ChineseCLIPProcessor"),
        ("clap", "ClapProcessor"),
        ("clip", "CLIPProcessor"),
        ("aimv2", "AIMv2Processor"),
        ("clipseg", "CLIPSegProcessor"),
        ("clvp", "ClvpProcessor"),
        ("colpali", "ColPaliProcessor"),

@@ -131,6 +131,13 @@ else:
                "CLIPTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        (
            "aimv2",
            (
                "CLIPTokenizer",
                "CLIPTokenizerFast" if is_tokenizers_available() else None,
            ),
        ),
        (
            "clipseg",
            (
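These auto-mapping entries are what let the `Auto*` classes resolve the `"aimv2"` model type. Only the configuration side can be exercised on this branch, because `modeling_aimv2.py` is still empty and `AIMv2Processor` is not implemented yet; a sketch of that configuration-side resolution:

```python
# Sketch: only AutoConfig can be exercised on this skeleton branch.
from transformers import AutoConfig

config = AutoConfig.for_model("aimv2")  # resolved via CONFIG_MAPPING_NAMES
print(type(config).__name__)            # AIMv2Config

# A round trip through a saved config.json also goes through the auto machinery.
config.save_pretrained("aimv2-auto-demo")
reloaded = AutoConfig.from_pretrained("aimv2-auto-demo")
print(reloaded.model_type)              # aimv2
```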
tests/models/aimv2/__init__.py (new empty file)

tests/models/aimv2/test_modeling_aimv2.py (new file, 1140 lines)
File diff suppressed because it is too large.