more fixes

yaswant19 2025-03-29 09:08:05 +05:30
parent 104296a3dc
commit 019210c9c7
4 changed files with 5 additions and 14 deletions

View File

@ -50,10 +50,5 @@ The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
- forward
## AIMv2ForImageClassification
[[autodoc]] AIMv2ForImageClassification
- forward
</pt>
<tf>

View File

@ -6752,15 +6752,12 @@ if TYPE_CHECKING:
model_addition_debugger_context,
)
from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
from .modeling_utils import AttentionInterface, PreTrainedModel
from .models.aimv2 import (
AIMv2Model,
AIMv2TextModel,
AIMv2VisionModel,
)
from .models.albert import (
AlbertForMaskedLM,
AlbertForMultipleChoice,

View File

@ -79,6 +79,8 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = {
r"text_encoder.trunk.blocks.(\d+).norm_2": r"text_model.encoder.layers.\1.rms_norm2",
r"text_encoder.trunk.post_trunk_norm": r"text_model.rms_norm",
r"text_projector": r"text_projection",
r"log_logit_scale": r"logit_scale",
}
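
The mapping above rewrites original checkpoint keys via regex substitution, with `\1`-style backreferences carrying layer indices through the rename (e.g. the newly added `log_logit_scale` -> `logit_scale` entry). A minimal sketch of how such a table can be applied; the helper below is illustrative, not the repo's actual `convert_old_keys_to_new_keys`:

```python
import re

# Illustrative helper (hypothetical): rename every checkpoint key using an
# ORIGINAL_TO_CONVERTED_KEY_MAPPING-style table of regex -> replacement.
def rename_state_dict_keys(state_dict, key_mapping):
    renamed = {}
    for old_key, tensor in state_dict.items():
        new_key = old_key
        for pattern, replacement in key_mapping.items():
            # Backreferences such as \1 carry layer indices through the rename.
            new_key = re.sub(pattern, replacement, new_key)
        renamed[new_key] = tensor
    return renamed
```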
@ -169,7 +171,7 @@ def write_model(
state_dict = {}
# For `apple/aimv2-large-patch14-native` we don't have position_embedding in state_dict
strict_loading = True
strict_loading = False
result = convert_old_keys_to_new_keys(original_state_dict, key_mapping)
all_keys = list(original_state_dict.keys())
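
Since the `apple/aimv2-large-patch14-native` checkpoint ships without a `position_embedding` entry, flipping to `strict_loading = False` presumably feeds `load_state_dict(..., strict=False)` so the missing key is reported rather than fatal. A minimal sketch of that behavior; the two-parameter module is purely illustrative:

```python
import torch
import torch.nn as nn

# Hypothetical stand-in for the converted model, with two parameters.
model = nn.Module()
model.weight = nn.Parameter(torch.zeros(4))
model.position_embedding = nn.Parameter(torch.zeros(4))

# A checkpoint that, like the native-resolution AIMv2 weights,
# lacks position_embedding.
checkpoint = {"weight": torch.ones(4)}

# strict=True would raise a RuntimeError about the missing key;
# strict=False records it instead and loads everything else.
missing, unexpected = model.load_state_dict(checkpoint, strict=False)
print(missing)     # ['position_embedding']
print(unexpected)  # []
```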

View File

@ -83,8 +83,7 @@ class AIMv2VisionConfig(SiglipVisionConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
use_head (`bool`, *optional*, defaults to `True`):
Whether to use the attention pooling head or not.
"""
"""
def __init__(
self,
hidden_size: int = 1024,
@ -174,8 +173,7 @@ class AIMv2TextConfig(SiglipTextConfig):
just in case (e.g., 512 or 1024 or 2048).
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
"""
"""
def __init__(
self,
vocab_size: int = 49408,
@ -268,7 +266,6 @@ class AIMv2Config(SiglipConfig):
>>> config = AIMv2Config.from_text_vision_configs(config_text, config_vision)
```"""
def __init__(
self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
):
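
As the docstring example above indicates, the combined config is composed from separate text and vision configs. A minimal usage sketch, assuming the AIMv2 config classes are exported from `transformers`; the field values are illustrative defaults taken from this diff:

```python
from transformers import AIMv2Config, AIMv2TextConfig, AIMv2VisionConfig

# Build the per-modality configs with their documented defaults.
config_text = AIMv2TextConfig(vocab_size=49408)
config_vision = AIMv2VisionConfig(hidden_size=1024)

# Compose the multimodal config, as in the docstring example above.
config = AIMv2Config.from_text_vision_configs(config_text, config_vision)
print(config.projection_dim)  # 512, per the __init__ signature above
```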