diff --git a/docs/source/en/model_doc/aimv2.md b/docs/source/en/model_doc/aimv2.md
index 917e6d8d816..ee9abfe194f 100644
--- a/docs/source/en/model_doc/aimv2.md
+++ b/docs/source/en/model_doc/aimv2.md
@@ -50,10 +50,5 @@ The original code can be found [here]().
     - forward
 
-## AIMv2ForImageClassification
-
-[[autodoc]] AIMv2ForImageClassification
-    - forward
-
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 7979b236211..6972bcf8295 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -6752,15 +6752,12 @@ if TYPE_CHECKING:
         model_addition_debugger_context,
     )
     from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
-
     from .modeling_utils import AttentionInterface, PreTrainedModel
-
     from .models.aimv2 import (
         AIMv2Model,
         AIMv2TextModel,
         AIMv2VisionModel,
     )
-
     from .models.albert import (
         AlbertForMaskedLM,
         AlbertForMultipleChoice,
diff --git a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py
index 6efd2d023ef..b8120375c3d 100644
--- a/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py
+++ b/src/transformers/models/aimv2/convert_aimv2_original_pytorch_to_hf.py
@@ -79,6 +79,8 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
     r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2",
     r"text_encoder.trunk.post_trunk_norm": r"text_model.rms_norm",
     r"text_projector": r"text_projection",
+    r"log_logit_scale": r"logit_scale",
+
 }
 
 
@@ -169,7 +171,7 @@ def write_model(
     state_dict = {}
 
     # For `apple/aimv2-large-patch14-native` we don't have position_embedding in state_dict
-    strict_loading = True
+    strict_loading = False
 
     result = convert_old_keys_to_new_keys(original_state_dict, key_mapping)
     all_keys = list(original_state_dict.keys())
diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py
index 35fc2e00018..d2e94c0c7f0 100644
--- a/src/transformers/models/aimv2/modular_aimv2.py
+++ b/src/transformers/models/aimv2/modular_aimv2.py
@@ -83,8 +83,7 @@ class AIMv2VisionConfig(SiglipVisionConfig):
             The standard deviation of the for initializing all weight matrices.
         use_head (`str`, *optional*, defaults to `True`):
             Whether to use Attention Pooling Head or Not.
-    """
-
+    """
     def __init__(
         self,
         hidden_size: int = 1024,
@@ -174,8 +173,7 @@ class AIMv2TextConfig(SiglipTextConfig):
             just in case (e.g., 512 or 1024 or 2048).
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the for initializing all weight matrices.
-    """
-
+    """
     def __init__(
         self,
         vocab_size: int = 49408,
@@ -268,7 +266,6 @@ class AIMv2Config(SiglipConfig):
 
     >>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision)
     ```"""
-
     def __init__(
         self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
     ):
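
Two of the changes above benefit from a quick illustration. First, the new `log_logit_scale` to `logit_scale` entry joins a table of regex renames applied to checkpoint keys by `convert_old_keys_to_new_keys`; below is a minimal sketch of how such a pattern table can be applied (the `rename_key` helper is hypothetical, not the converter's actual implementation):

```python
import re

# Excerpt of the mapping from the diff; patterns may carry backreferences like \1.
KEY_MAPPING = {
    r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2",
    r"log_logit_scale": r"logit_scale",
}

def rename_key(old_key: str) -> str:
    # Apply the first pattern that matches; leave unmatched keys untouched.
    for pattern, replacement in KEY_MAPPING.items():
        new_key, count = re.subn(pattern, replacement, old_key)
        if count:
            return new_key
    return old_key

print(rename_key("text_encoder.trunk.blocks.3.norm_2.weight"))
# -> text_model.encoder.layers.3.rms_norm2.weight
print(rename_key("log_logit_scale"))  # -> logit_scale
```

Second, `strict_loading = False` is needed because, per the in-line comment, `apple/aimv2-large-patch14-native` ships no `position_embedding` in its state dict. A self-contained sketch of the underlying PyTorch behavior (the toy module is hypothetical, not AIMv2 code):

```python
import torch
from torch import nn

class TinyStub(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 8)
        self.position_embedding = nn.Embedding(4, 8)  # absent from some checkpoints

model = TinyStub()
ckpt = {"proj.weight": torch.zeros(8, 16), "proj.bias": torch.zeros(8)}

# strict=True would raise on the missing key; strict=False loads what exists
# and returns the missing/unexpected key lists instead.
missing, unexpected = model.load_state_dict(ckpt, strict=False)
print(missing)     # ['position_embedding.weight']
print(unexpected)  # []
```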