Mirror of https://github.com/huggingface/transformers.git
Cleanup the usage of layer_norm_eps in some models (#21336)

* fix
* fix
* make style
* For CLIP
* For OwlViT
* For XCLIP
* For CLIPSeg
* For GroupViT
* fix docstrings
* fix docstrings
* For AltCLIP
* For ChineseCLIP
* For Blip
* For GiT
* make style
* update
* update
* update
* fix
* fix
* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 623346ab18
commit 98d40fed3a
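The diff below applies one recurring fix: every nn.LayerNorm that silently used PyTorch's default epsilon (1e-5) now receives the value from the model config, and several config docstrings are untangled so layer_norm_eps is documented as its own argument. A minimal sketch, not part of the commit and using only plain PyTorch, of why the mismatch matters (BLIP's text config, for instance, specifies 1e-12):

import torch
from torch import nn

torch.manual_seed(0)
hidden = torch.randn(2, 4, 8) * 1e-3         # small activations make the epsilon visible
ln_default = nn.LayerNorm(8)                 # eps=1e-5, the old hard-coded behaviour
ln_from_config = nn.LayerNorm(8, eps=1e-12)  # eps taken from the config, the new behaviour
print((ln_default(hidden) - ln_from_config(hidden)).abs().max())  # clearly non-zero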
src/transformers/models/altclip/configuration_altclip.py

@@ -173,8 +173,9 @@ class AltCLIPVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -213,7 +214,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
src/transformers/models/altclip/modeling_altclip.py

@@ -844,9 +844,9 @@ class AltCLIPEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = AltCLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = AltCLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -1099,9 +1099,9 @@ class AltCLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size

         self.embeddings = AltCLIPVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = AltCLIPEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig)
src/transformers/models/blip/configuration_blip.py

@@ -74,8 +74,9 @@ class BlipTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
-            to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -207,8 +208,9 @@ class BlipVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
-            to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -247,7 +249,7 @@ class BlipVisionConfig(PretrainedConfig):
         image_size=384,
         patch_size=16,
         hidden_act="gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=1e-10,
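A quick way to see that the documented defaults above line up with the code, using the public BLIP config classes (a sketch, not part of the commit):

from transformers import BlipTextConfig, BlipVisionConfig

print(BlipTextConfig().layer_norm_eps)    # 1e-12
print(BlipVisionConfig().layer_norm_eps)  # 1e-05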
src/transformers/models/blip/modeling_blip.py

@@ -374,9 +374,9 @@ class BlipEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = BlipAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = BlipMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -665,7 +665,7 @@ class BlipVisionModel(BlipPreTrainedModel):

         self.embeddings = BlipVisionEmbeddings(config)
         self.encoder = BlipEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

         self.post_init()

src/transformers/models/chinese_clip/configuration_chinese_clip.py

@@ -187,8 +187,9 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -225,7 +226,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
src/transformers/models/chinese_clip/modeling_chinese_clip.py

@@ -626,9 +626,9 @@ class ChineseCLIPVisionLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = ChineseCLIPVisionAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = ChineseCLIPVisionMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -1054,9 +1054,9 @@ class ChineseCLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size

         self.embeddings = ChineseCLIPVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = ChineseCLIPVisionEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig)
src/transformers/models/clip/configuration_clip.py

@@ -64,8 +64,9 @@ class CLIPTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -102,7 +103,7 @@ class CLIPTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -171,8 +172,9 @@ class CLIPVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -211,7 +213,7 @@ class CLIPVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
src/transformers/models/clip/modeling_clip.py

@@ -356,9 +356,9 @@ class CLIPEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = CLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = CLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -680,7 +680,7 @@ class CLIPTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = CLIPTextEmbeddings(config)
         self.encoder = CLIPEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
@@ -830,9 +830,9 @@ class CLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size

         self.embeddings = CLIPVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = CLIPEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)
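A small sanity sketch for the CLIP hunks above, using the public CLIP classes with a deliberately tiny config so instantiation is cheap; the attribute paths follow the code shown in the diff:

from transformers import CLIPVisionConfig, CLIPVisionModel

config = CLIPVisionConfig(
    hidden_size=32, intermediate_size=64, num_hidden_layers=2,
    num_attention_heads=2, image_size=32, patch_size=16, layer_norm_eps=1e-6,
)
model = CLIPVisionModel(config)
print(model.vision_model.pre_layrnorm.eps)                   # 1e-06, not the nn.LayerNorm default
print(model.vision_model.encoder.layers[0].layer_norm1.eps)  # 1e-06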
src/transformers/models/clipseg/configuration_clipseg.py

@@ -56,8 +56,9 @@ class CLIPSegTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -93,7 +94,7 @@ class CLIPSegTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -161,8 +162,9 @@ class CLIPSegVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -200,7 +202,7 @@ class CLIPSegVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -270,8 +272,7 @@ class CLIPSegConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
         decoder_intermediate_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder.
         conditional_layer (`int`, *optional*, defaults to 0):
src/transformers/models/clipseg/modeling_clipseg.py

@@ -379,9 +379,9 @@ class CLIPSegEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = CLIPSegAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = CLIPSegMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -691,7 +691,7 @@ class CLIPSegTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = CLIPSegTextEmbeddings(config)
         self.encoder = CLIPSegEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig)
@@ -837,9 +837,9 @@ class CLIPSegVisionTransformer(nn.Module):
         embed_dim = config.hidden_size

         self.embeddings = CLIPSegVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = CLIPSegEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
@@ -1178,9 +1178,9 @@ class CLIPSegDecoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = CLIPSegAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = CLIPSegMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
src/transformers/models/git/configuration_git.py

@@ -54,8 +54,9 @@ class GitVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -94,7 +95,7 @@ class GitVisionConfig(PretrainedConfig):
         image_size=224,
         patch_size=16,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
src/transformers/models/git/modeling_git.py

@@ -762,9 +762,9 @@ class GitVisionEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = GitVisionAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = GitVisionMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -935,9 +935,9 @@ class GitVisionTransformer(nn.Module):
         embed_dim = config.hidden_size

         self.embeddings = GitVisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(embed_dim)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = GitVisionEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig)
@@ -1048,7 +1048,8 @@ class GitProjection(nn.Module):
         super().__init__()
         self.config = config
         self.visual_projection = nn.Sequential(
-            nn.Linear(config.vision_config.hidden_size, config.hidden_size), nn.LayerNorm(config.hidden_size)
+            nn.Linear(config.vision_config.hidden_size, config.hidden_size),
+            nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
         )

     def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
src/transformers/models/groupvit/configuration_groupvit.py

@@ -100,7 +100,7 @@ class GroupViTTextConfig(PretrainedConfig):
         num_attention_heads=4,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
|
||||
super().__init__()
|
||||
self.embed_dim = config.hidden_size
|
||||
self.self_attn = GroupViTAttention(config)
|
||||
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
|
||||
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
self.mlp = GroupViTMLP(config)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
|
||||
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -1076,7 +1076,7 @@ class GroupViTTextTransformer(nn.Module):
|
||||
embed_dim = config.hidden_size
|
||||
self.embeddings = GroupViTTextEmbeddings(config)
|
||||
self.encoder = GroupViTTextEncoder(config)
|
||||
self.final_layer_norm = nn.LayerNorm(embed_dim)
|
||||
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
|
||||
@ -1219,7 +1219,7 @@ class GroupViTVisionTransformer(nn.Module):
|
||||
|
||||
self.embeddings = GroupViTVisionEmbeddings(config)
|
||||
self.encoder = GroupViTVisionEncoder(config)
|
||||
self.layernorm = nn.LayerNorm(embed_dim)
|
||||
self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
||||
|
||||
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
|
||||
|
src/transformers/models/oneformer/modeling_oneformer.py

@@ -1063,13 +1063,13 @@ class OneFormerPixelDecoderEncoderLayer(nn.Module):
             n_points=4,
         )

-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.dropout = config.dropout
         self.activation_fn = nn.functional.relu
         self.activation_dropout = config.dropout
         self.fc1 = nn.Linear(self.embed_dim, config.encoder_feedforward_dim)
         self.fc2 = nn.Linear(config.encoder_feedforward_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

         self.is_training = config.is_training

@@ -1634,11 +1634,13 @@ class OneFormerAttention(nn.Module):


 class OneFormerTransformerDecoderSelfAttentionLayer(nn.Module):
-    def __init__(self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False):
+    def __init__(
+        self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False, layer_norm_eps=1e-05
+    ):
         super().__init__()
         self.self_attn = OneFormerAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, is_decoder=True)

-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=layer_norm_eps)
         self.dropout = nn.Dropout(dropout)

         self.activation = ACT2FN[activation]
@@ -1690,11 +1692,13 @@ class OneFormerTransformerDecoderSelfAttentionLayer(nn.Module):


 class OneFormerTransformerDecoderCrossAttentionLayer(nn.Module):
-    def __init__(self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False):
+    def __init__(
+        self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False, layer_norm_eps=1e-05
+    ):
         super().__init__()
         self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)

-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = nn.LayerNorm(embed_dim, eps=layer_norm_eps)
         self.dropout = nn.Dropout(dropout)

         self.activation = ACT2FN[activation]
@@ -1760,14 +1764,22 @@ class OneFormerTransformerDecoderCrossAttentionLayer(nn.Module):


 class OneFormerTransformerDecoderFFNLayer(nn.Module):
-    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False):
+    def __init__(
+        self,
+        d_model,
+        dim_feedforward=2048,
+        dropout=0.0,
+        activation="relu",
+        normalize_before=False,
+        layer_norm_eps=1e-05,
+    ):
         super().__init__()
         # Implementation of Feedforward model
         self.linear1 = nn.Linear(d_model, dim_feedforward)
         self.dropout = nn.Dropout(dropout)
         self.linear2 = nn.Linear(dim_feedforward, d_model)

-        self.norm = nn.LayerNorm(d_model)
+        self.norm = nn.LayerNorm(d_model, eps=layer_norm_eps)

         self.activation = ACT2FN[activation]
         self.normalize_before = normalize_before
@@ -1836,6 +1848,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
             num_heads=config.num_attention_heads,
             dropout=0.0,
             normalize_before=config.pre_norm,
+            layer_norm_eps=config.layer_norm_eps,
         )

         self.self_attn = OneFormerTransformerDecoderSelfAttentionLayer(
@@ -1843,6 +1856,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
             num_heads=config.num_attention_heads,
             dropout=0.0,
             normalize_before=config.pre_norm,
+            layer_norm_eps=config.layer_norm_eps,
         )

         self.ffn = OneFormerTransformerDecoderFFNLayer(
@@ -1850,6 +1864,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
             dim_feedforward=config.dim_feedforward,
             dropout=0.0,
             normalize_before=config.pre_norm,
+            layer_norm_eps=config.layer_norm_eps,
         )

     def forward(
@@ -1965,6 +1980,7 @@ class OneFormerTransformerDecoderQueryTransformerDecoderLayer(nn.Module):
         dropout=0.1,
         activation="relu",
         normalize_before=False,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()
         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
@@ -1974,9 +1990,9 @@ class OneFormerTransformerDecoderQueryTransformerDecoderLayer(nn.Module):
         self.dropout = nn.Dropout(dropout)
         self.linear2 = nn.Linear(dim_feedforward, d_model)

-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
+        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.dropout1 = nn.Dropout(dropout)
         self.dropout2 = nn.Dropout(dropout)
         self.dropout3 = nn.Dropout(dropout)
@@ -2094,13 +2110,14 @@ class OneFormerTransformerDecoderQueryTransformer(nn.Module):
         activation="relu",
         normalize_before=False,
         return_intermediate_dec=False,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()

         decoder_layer = OneFormerTransformerDecoderQueryTransformerDecoderLayer(
-            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+            d_model, nhead, dim_feedforward, dropout, activation, normalize_before, layer_norm_eps
         )
-        decoder_norm = nn.LayerNorm(d_model)
+        decoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.decoder = OneFormerTransformerDecoderQueryTransformerDecoder(
             decoder_layer,
             num_decoder_layers,
@@ -2151,9 +2168,10 @@ class OneFormerTransformerDecoder(nn.Module):
             num_decoder_layers=config.query_dec_layers,
             normalize_before=config.pre_norm,
             return_intermediate_dec=False,
+            layer_norm_eps=config.layer_norm_eps,
         )

-        self.decoder_norm = nn.LayerNorm(config.hidden_dim)
+        self.decoder_norm = nn.LayerNorm(config.hidden_dim, eps=config.layer_norm_eps)

         self.num_feature_levels = 3

@@ -2456,14 +2474,15 @@ class OneFormerTextTransformerDecoderLayer(nn.Module):
         d_model,
         nhead,
         dropout=0.1,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()
         self.self_attn = OneFormerTextMapperAttention(d_model, nhead, proj_drop=dropout)
         self.cross_attn = OneFormerTextMapperAttention(d_model, nhead, proj_drop=dropout)

-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
+        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
         self.dropout = nn.Dropout(dropout)

         self.mlp = nn.Sequential(
@@ -2481,29 +2500,38 @@ class OneFormerTextTransformerDecoderLayer(nn.Module):

 class OneFormerTextContextDecoder(nn.Module):
     def __init__(
-        self, transformer_width=256, transformer_heads=4, transformer_layers=6, visual_dim=1024, dropout=0.1, **kwargs
+        self,
+        transformer_width=256,
+        transformer_heads=4,
+        transformer_layers=6,
+        visual_dim=1024,
+        dropout=0.1,
+        layer_norm_eps=1e-05,
+        **kwargs
     ):
         super().__init__()

         self.memory_proj = nn.Sequential(
-            nn.LayerNorm(visual_dim),
+            nn.LayerNorm(visual_dim, eps=layer_norm_eps),
             nn.Linear(visual_dim, transformer_width),
-            nn.LayerNorm(transformer_width),
+            nn.LayerNorm(transformer_width, eps=layer_norm_eps),
         )

         self.text_proj = nn.Sequential(
-            nn.LayerNorm(visual_dim),
+            nn.LayerNorm(visual_dim, eps=layer_norm_eps),
             nn.Linear(visual_dim, transformer_width),
         )

         self.decoder = nn.ModuleList(
             [
-                OneFormerTextTransformerDecoderLayer(transformer_width, transformer_heads, dropout)
+                OneFormerTextTransformerDecoderLayer(transformer_width, transformer_heads, dropout, layer_norm_eps)
                 for _ in range(transformer_layers)
             ]
         )

-        self.out_proj = nn.Sequential(nn.LayerNorm(transformer_width), nn.Linear(transformer_width, visual_dim))
+        self.out_proj = nn.Sequential(
+            nn.LayerNorm(transformer_width, eps=layer_norm_eps), nn.Linear(transformer_width, visual_dim)
+        )

     def forward(self, text, visual):
         visual = self.memory_proj(visual)
@@ -2538,12 +2566,12 @@ class OneFormerTextMLP(nn.Module):


 class OneFormerTextTransformerLayer(nn.Module):
-    def __init__(self, width: int, heads: int, attn_mask: torch.Tensor):
+    def __init__(self, width: int, heads: int, attn_mask: torch.Tensor, layer_norm_eps=1e-05):
         super().__init__()
         self.self_attn = nn.MultiheadAttention(width, heads)
-        self.layer_norm1 = nn.LayerNorm(width)
+        self.layer_norm1 = nn.LayerNorm(width, eps=layer_norm_eps)
         self.mlp = OneFormerTextMLP(width, width * 4, width)
-        self.layer_norm2 = nn.LayerNorm(width)
+        self.layer_norm2 = nn.LayerNorm(width, eps=layer_norm_eps)
         self.attn_mask = attn_mask

     def forward(
@@ -2572,11 +2600,21 @@ class OneFormerTextTransformerLayer(nn.Module):


 class OneFormerTextTransformer(nn.Module):
-    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False):
+    def __init__(
+        self,
+        width: int,
+        layers: int,
+        heads: int,
+        attn_mask: torch.Tensor = None,
+        use_checkpoint=False,
+        layer_norm_eps=1e-05,
+    ):
         super().__init__()
         self.width = width
         self.num_layers = layers
-        self.layers = nn.Sequential(*[OneFormerTextTransformerLayer(width, heads, attn_mask) for _ in range(layers)])
+        self.layers = nn.Sequential(
+            *[OneFormerTextTransformerLayer(width, heads, attn_mask, layer_norm_eps) for _ in range(layers)]
+        )
         self.use_checkpoint = use_checkpoint

     def forward(self, hidden_states: torch.Tensor):
@@ -2596,6 +2634,7 @@ class OneFormerTextEncoder(nn.Module):
         layers: int,
         vocab_size,
         use_checkpoint=False,
+        layer_norm_eps=1e-05,
     ):
         super().__init__()
         heads = width // 64
@@ -2607,10 +2646,11 @@ class OneFormerTextEncoder(nn.Module):
             heads=heads,
             attn_mask=self.build_attention_mask(),
             use_checkpoint=use_checkpoint,
+            layer_norm_eps=layer_norm_eps,
         )

         self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
-        self.ln_final = nn.LayerNorm(width)
+        self.ln_final = nn.LayerNorm(width, eps=layer_norm_eps)
         self.token_embedding = nn.Embedding(vocab_size, width)

     def build_attention_mask(self):
@@ -2641,6 +2681,7 @@ class OneFormerTextMapper(nn.Module):
             width=config.text_encoder_width,
             layers=config.text_encoder_num_layers,
             vocab_size=config.text_encoder_vocab_size,
+            layer_norm_eps=config.layer_norm_eps,
         )

         self.text_projector = OneFormerMLPPredictionHead(
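OneFormer's inner decoder layers are not built directly from a config object, so the epsilon is threaded through as a constructor keyword defaulting to 1e-05, as the hunks above show. A standalone sketch of that pattern; the class and argument names here are illustrative, not the library's:

import torch
from torch import nn

class DecoderFFNLayer(nn.Module):
    def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, layer_norm_eps=1e-05):
        super().__init__()
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm = nn.LayerNorm(d_model, eps=layer_norm_eps)  # epsilon comes from the caller

    def forward(self, hidden_states):
        residual = hidden_states
        hidden_states = self.linear2(self.dropout(torch.relu(self.linear1(hidden_states))))
        return self.norm(residual + hidden_states)

# A top-level module would forward config.layer_norm_eps when building the layer:
layer = DecoderFFNLayer(d_model=64, layer_norm_eps=1e-6)
print(layer.norm.eps)  # 1e-06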
src/transformers/models/owlvit/configuration_owlvit.py

@@ -66,8 +66,9 @@ class OwlViTTextConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         dropout (`float`, *optional*, defaults to 0.0):
@@ -103,7 +104,7 @@ class OwlViTTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=16,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -173,8 +174,9 @@ class OwlViTVisionConfig(PretrainedConfig):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
-            defaults to 1e-5): The epsilon used by the layer normalization layers.
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -212,7 +214,7 @@ class OwlViTVisionConfig(PretrainedConfig):
         image_size=768,
         patch_size=32,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
src/transformers/models/owlvit/modeling_owlvit.py

@@ -476,9 +476,9 @@ class OwlViTEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = OwlViTAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = OwlViTMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -790,7 +790,7 @@ class OwlViTTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = OwlViTTextEmbeddings(config)
         self.encoder = OwlViTEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
@@ -922,9 +922,9 @@ class OwlViTVisionTransformer(nn.Module):
         self.config = config

         self.embeddings = OwlViTVisionEmbeddings(config)
-        self.pre_layernorm = nn.LayerNorm(config.hidden_size)
+        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.encoder = OwlViTEncoder(config)
-        self.post_layernorm = nn.LayerNorm(config.hidden_size)
+        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
@@ -1318,7 +1318,7 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
         self.class_head = OwlViTClassPredictionHead(config)
         self.box_head = OwlViTBoxPredictionHead(config)

-        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
         self.sigmoid = nn.Sigmoid()

     def normalize_grid_corner_coordinates(self, feature_map: torch.FloatTensor):
src/transformers/models/x_clip/configuration_x_clip.py

@@ -95,7 +95,7 @@ class XCLIPTextConfig(PretrainedConfig):
         num_attention_heads=8,
         max_position_embeddings=77,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -220,7 +220,7 @@ class XCLIPVisionConfig(PretrainedConfig):
         patch_size=32,
         num_frames=8,
         hidden_act="quick_gelu",
-        layer_norm_eps=0.00001,
+        layer_norm_eps=1e-5,
         dropout=0.0,
         attention_dropout=0.0,
         initializer_range=0.02,
src/transformers/models/x_clip/modeling_x_clip.py

@@ -311,9 +311,9 @@ class XCLIPEncoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.self_attn = XCLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = XCLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -403,15 +403,15 @@ class XCLIPVisionEncoderLayer(nn.Module):
         self.embed_dim = config.hidden_size

         self.message_fc = nn.Linear(self.embed_dim, self.embed_dim)
-        self.message_ln = nn.LayerNorm(self.embed_dim)
+        self.message_ln = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.message_attn = XCLIPAttention(config)

         self.drop_path = XCLIPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

         self.self_attn = XCLIPAttention(config)
-        self.layer_norm1 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
         self.mlp = XCLIPMLP(config)
-        self.layer_norm2 = nn.LayerNorm(self.embed_dim)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

     def forward(
         self,
@@ -744,7 +744,7 @@ class XCLIPTextTransformer(nn.Module):
         embed_dim = config.hidden_size
         self.embeddings = XCLIPTextEmbeddings(config)
         self.encoder = XCLIPEncoder(config)
-        self.final_layer_norm = nn.LayerNorm(embed_dim)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig)
@@ -989,9 +989,9 @@ class XCLIPVisionTransformer(nn.Module):
         embed_dim = config.hidden_size

         self.embeddings = XCLIPVisionEmbeddings(config)
-        self.pre_layernorm = nn.LayerNorm(embed_dim)
+        self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         self.encoder = XCLIPVisionEncoder(config)
-        self.post_layernorm = nn.LayerNorm(embed_dim)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

     @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig)
@@ -1218,8 +1218,8 @@ class PromptGeneratorLayer(nn.Module):

         embed_dim = config.projection_dim
         self.cross_attn = XCLIPCrossAttention(config)
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.norm3 = nn.LayerNorm(embed_dim)
+        self.norm1 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
+        self.norm3 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
         self.mlp = nn.Sequential(
             nn.Linear(embed_dim, embed_dim * 4),
             ACT2FN[config.prompt_hidden_act],
@@ -1239,7 +1239,7 @@ class XCLIPPromptGenerator(nn.Module):
     def __init__(self, config):
         super().__init__()
         embed_dim = config.projection_dim
-        self.layernorm = nn.LayerNorm(embed_dim)
+        self.layernorm = nn.LayerNorm(embed_dim, eps=config.vision_config.layer_norm_eps)
         self.decoder = nn.ModuleList([PromptGeneratorLayer(config) for _ in range(config.prompt_layers)])
         self.alpha = nn.Parameter(torch.ones(embed_dim) * config.prompt_alpha)

@@ -1284,7 +1284,7 @@ class XCLIPModel(XCLIPPreTrainedModel):
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
         self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)

-        self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim)
+        self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
         self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))

         mit_config = copy(vision_config)
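For X-CLIP the prompt-related LayerNorms now read their epsilon from the text and vision sub-configs rather than relying on the nn.LayerNorm default. A quick look at where those values live, using the public config class (illustrative only):

from transformers import XCLIPConfig

config = XCLIPConfig()
print(config.text_config.layer_norm_eps, config.vision_config.layer_norm_eps)  # 1e-05 1e-05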