Cleanup the usage of layer_norm_eps in some models (#21336)

* fix

* fix

* make style

* For CLIP

* For OwlViT

* For XCLIP

* For CLIPSeg

* For GroupViT

* fix docstrings

* fix docstrings

* For AltCLIP

* For ChineseCLIP

* For Blip

* For GiT

* make style

* update

* update

* update

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Yih-Dar 2023-01-31 13:54:16 +01:00 committed by GitHub
parent 623346ab18
commit 98d40fed3a
19 changed files with 168 additions and 116 deletions
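Before the per-file diffs, a minimal sketch of the pattern this commit applies (the class below is illustrative, not taken from the diff): the touched modules used to construct `nn.LayerNorm` without an `eps` argument, silently falling back to PyTorch's default of 1e-5, and now forward the epsilon configured on the model.

```python
import torch
from torch import nn


class ToyEncoderLayer(nn.Module):
    """Illustrative module (not from the commit) showing the before/after pattern."""

    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        # before: nn.LayerNorm(embed_dim)  -> always PyTorch's default eps of 1e-5
        # after: pass the configured value so non-default epsilons are respected
        self.layer_norm1 = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.layer_norm2 = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.layer_norm2(self.layer_norm1(hidden_states))
```

Because most of the touched configs keep `layer_norm_eps` at 1e-5 (the config changes merely respell `0.00001` as `1e-5`), default checkpoints produce identical outputs; the explicit argument only changes behavior when a config overrides the epsilon.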

src/transformers/models/altclip/configuration_altclip.py

@ -173,8 +173,9 @@ class AltCLIPVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -213,7 +214,7 @@ class AltCLIPVisionConfig(PretrainedConfig):
image_size=224,
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
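For context on the `layer_norm_eps` entries documented above: the epsilon is the small constant added to the variance inside layer normalization to keep the denominator away from zero. A quick reference implementation (illustrative values, not part of this commit), checked against `nn.LayerNorm`:

```python
import torch
from torch import nn


def manual_layer_norm(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float) -> torch.Tensor:
    # Normalize over the last dimension; eps is added to the (biased) variance before the square root.
    mean = x.mean(dim=-1, keepdim=True)
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    return (x - mean) / torch.sqrt(var + eps) * weight + bias


dim = 8
x = torch.randn(2, dim)
ln = nn.LayerNorm(dim, eps=1e-12)  # deliberately non-default epsilon
assert torch.allclose(ln(x), manual_layer_norm(x, ln.weight, ln.bias, eps=1e-12), atol=1e-6)
```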

src/transformers/models/altclip/modeling_altclip.py

@ -844,9 +844,9 @@ class AltCLIPEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = AltCLIPAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = AltCLIPMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -1099,9 +1099,9 @@ class AltCLIPVisionTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = AltCLIPVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = AltCLIPEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig)

src/transformers/models/blip/configuration_blip.py

@ -74,8 +74,9 @@ class BlipTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
dropout (`float`, *optional*, defaults to 0.0):
@ -207,8 +208,9 @@ class BlipVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -247,7 +249,7 @@ class BlipVisionConfig(PretrainedConfig):
image_size=384,
patch_size=16,
hidden_act="gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=1e-10,

src/transformers/models/blip/modeling_blip.py

@ -374,9 +374,9 @@ class BlipEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = BlipAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = BlipMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -665,7 +665,7 @@ class BlipVisionModel(BlipPreTrainedModel):
self.embeddings = BlipVisionEmbeddings(config)
self.encoder = BlipEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.post_init()

src/transformers/models/chinese_clip/configuration_chinese_clip.py

@ -187,8 +187,9 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -225,7 +226,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
image_size=224,
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,

src/transformers/models/chinese_clip/modeling_chinese_clip.py

@ -626,9 +626,9 @@ class ChineseCLIPVisionLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = ChineseCLIPVisionAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = ChineseCLIPVisionMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -1054,9 +1054,9 @@ class ChineseCLIPVisionTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = ChineseCLIPVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = ChineseCLIPVisionEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig)

src/transformers/models/clip/configuration_clip.py

@ -64,8 +64,9 @@ class CLIPTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
dropout (`float`, *optional*, defaults to 0.0):
@ -102,7 +103,7 @@ class CLIPTextConfig(PretrainedConfig):
num_attention_heads=8,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
@ -171,8 +172,9 @@ class CLIPVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -211,7 +213,7 @@ class CLIPVisionConfig(PretrainedConfig):
image_size=224,
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,

src/transformers/models/clip/modeling_clip.py

@ -356,9 +356,9 @@ class CLIPEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = CLIPAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = CLIPMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -680,7 +680,7 @@ class CLIPTextTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = CLIPTextEmbeddings(config)
self.encoder = CLIPEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig)
@ -830,9 +830,9 @@ class CLIPVisionTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = CLIPVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = CLIPEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig)

src/transformers/models/clipseg/configuration_clipseg.py

@ -56,8 +56,9 @@ class CLIPSegTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
dropout (`float`, *optional*, defaults to 0.0):
@ -93,7 +94,7 @@ class CLIPSegTextConfig(PretrainedConfig):
num_attention_heads=8,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
@ -161,8 +162,9 @@ class CLIPSegVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -200,7 +202,7 @@ class CLIPSegVisionConfig(PretrainedConfig):
image_size=224,
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
@ -270,8 +272,7 @@ class CLIPSegConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
decoder_intermediate_size (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder.
conditional_layer (`int`, *optional*, defaults to 0):

src/transformers/models/clipseg/modeling_clipseg.py

@ -379,9 +379,9 @@ class CLIPSegEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = CLIPSegAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = CLIPSegMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -691,7 +691,7 @@ class CLIPSegTextTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = CLIPSegTextEmbeddings(config)
self.encoder = CLIPSegEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig)
@ -837,9 +837,9 @@ class CLIPSegVisionTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = CLIPSegVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = CLIPSegEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig)
@ -1178,9 +1178,9 @@ class CLIPSegDecoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = CLIPSegAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = CLIPSegMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,

src/transformers/models/git/configuration_git.py

@ -54,8 +54,9 @@ class GitVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -94,7 +95,7 @@ class GitVisionConfig(PretrainedConfig):
image_size=224,
patch_size=16,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,

src/transformers/models/git/modeling_git.py

@ -762,9 +762,9 @@ class GitVisionEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = GitVisionAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = GitVisionMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -935,9 +935,9 @@ class GitVisionTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = GitVisionEmbeddings(config)
self.pre_layrnorm = nn.LayerNorm(embed_dim)
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = GitVisionEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig)
@ -1048,7 +1048,8 @@ class GitProjection(nn.Module):
super().__init__()
self.config = config
self.visual_projection = nn.Sequential(
nn.Linear(config.vision_config.hidden_size, config.hidden_size), nn.LayerNorm(config.hidden_size)
nn.Linear(config.vision_config.hidden_size, config.hidden_size),
nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
)
def forward(self, embeddings: torch.Tensor) -> torch.Tensor:

src/transformers/models/groupvit/configuration_groupvit.py

@ -100,7 +100,7 @@ class GroupViTTextConfig(PretrainedConfig):
num_attention_heads=4,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,

src/transformers/models/groupvit/modeling_groupvit.py

@ -714,9 +714,9 @@ class GroupViTEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = GroupViTAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = GroupViTMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -1076,7 +1076,7 @@ class GroupViTTextTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = GroupViTTextEmbeddings(config)
self.encoder = GroupViTTextEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTTextConfig)
@ -1219,7 +1219,7 @@ class GroupViTVisionTransformer(nn.Module):
self.embeddings = GroupViTVisionEmbeddings(config)
self.encoder = GroupViTVisionEncoder(config)
self.layernorm = nn.LayerNorm(embed_dim)
self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)

src/transformers/models/oneformer/modeling_oneformer.py

@ -1063,13 +1063,13 @@ class OneFormerPixelDecoderEncoderLayer(nn.Module):
n_points=4,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.dropout = config.dropout
self.activation_fn = nn.functional.relu
self.activation_dropout = config.dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_feedforward_dim)
self.fc2 = nn.Linear(config.encoder_feedforward_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.is_training = config.is_training
@ -1634,11 +1634,13 @@ class OneFormerAttention(nn.Module):
class OneFormerTransformerDecoderSelfAttentionLayer(nn.Module):
def __init__(self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False):
def __init__(
self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False, layer_norm_eps=1e-05
):
super().__init__()
self.self_attn = OneFormerAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, is_decoder=True)
self.norm = nn.LayerNorm(embed_dim)
self.norm = nn.LayerNorm(embed_dim, eps=layer_norm_eps)
self.dropout = nn.Dropout(dropout)
self.activation = ACT2FN[activation]
@ -1690,11 +1692,13 @@ class OneFormerTransformerDecoderSelfAttentionLayer(nn.Module):
class OneFormerTransformerDecoderCrossAttentionLayer(nn.Module):
def __init__(self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False):
def __init__(
self, embed_dim, num_heads, dropout=0.0, activation="relu", normalize_before=False, layer_norm_eps=1e-05
):
super().__init__()
self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
self.norm = nn.LayerNorm(embed_dim)
self.norm = nn.LayerNorm(embed_dim, eps=layer_norm_eps)
self.dropout = nn.Dropout(dropout)
self.activation = ACT2FN[activation]
@ -1760,14 +1764,22 @@ class OneFormerTransformerDecoderCrossAttentionLayer(nn.Module):
class OneFormerTransformerDecoderFFNLayer(nn.Module):
def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False):
def __init__(
self,
d_model,
dim_feedforward=2048,
dropout=0.0,
activation="relu",
normalize_before=False,
layer_norm_eps=1e-05,
):
super().__init__()
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm = nn.LayerNorm(d_model)
self.norm = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.activation = ACT2FN[activation]
self.normalize_before = normalize_before
@ -1836,6 +1848,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
dropout=0.0,
normalize_before=config.pre_norm,
layer_norm_eps=config.layer_norm_eps,
)
self.self_attn = OneFormerTransformerDecoderSelfAttentionLayer(
@ -1843,6 +1856,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
dropout=0.0,
normalize_before=config.pre_norm,
layer_norm_eps=config.layer_norm_eps,
)
self.ffn = OneFormerTransformerDecoderFFNLayer(
@ -1850,6 +1864,7 @@ class OneFormerTransformerDecoderLayer(nn.Module):
dim_feedforward=config.dim_feedforward,
dropout=0.0,
normalize_before=config.pre_norm,
layer_norm_eps=config.layer_norm_eps,
)
def forward(
@ -1965,6 +1980,7 @@ class OneFormerTransformerDecoderQueryTransformerDecoderLayer(nn.Module):
dropout=0.1,
activation="relu",
normalize_before=False,
layer_norm_eps=1e-05,
):
super().__init__()
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
@ -1974,9 +1990,9 @@ class OneFormerTransformerDecoderQueryTransformerDecoderLayer(nn.Module):
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
@ -2094,13 +2110,14 @@ class OneFormerTransformerDecoderQueryTransformer(nn.Module):
activation="relu",
normalize_before=False,
return_intermediate_dec=False,
layer_norm_eps=1e-05,
):
super().__init__()
decoder_layer = OneFormerTransformerDecoderQueryTransformerDecoderLayer(
d_model, nhead, dim_feedforward, dropout, activation, normalize_before
d_model, nhead, dim_feedforward, dropout, activation, normalize_before, layer_norm_eps
)
decoder_norm = nn.LayerNorm(d_model)
decoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.decoder = OneFormerTransformerDecoderQueryTransformerDecoder(
decoder_layer,
num_decoder_layers,
@ -2151,9 +2168,10 @@ class OneFormerTransformerDecoder(nn.Module):
num_decoder_layers=config.query_dec_layers,
normalize_before=config.pre_norm,
return_intermediate_dec=False,
layer_norm_eps=config.layer_norm_eps,
)
self.decoder_norm = nn.LayerNorm(config.hidden_dim)
self.decoder_norm = nn.LayerNorm(config.hidden_dim, eps=config.layer_norm_eps)
self.num_feature_levels = 3
@ -2456,14 +2474,15 @@ class OneFormerTextTransformerDecoderLayer(nn.Module):
d_model,
nhead,
dropout=0.1,
layer_norm_eps=1e-05,
):
super().__init__()
self.self_attn = OneFormerTextMapperAttention(d_model, nhead, proj_drop=dropout)
self.cross_attn = OneFormerTextMapperAttention(d_model, nhead, proj_drop=dropout)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
self.dropout = nn.Dropout(dropout)
self.mlp = nn.Sequential(
@ -2481,29 +2500,38 @@ class OneFormerTextTransformerDecoderLayer(nn.Module):
class OneFormerTextContextDecoder(nn.Module):
def __init__(
self, transformer_width=256, transformer_heads=4, transformer_layers=6, visual_dim=1024, dropout=0.1, **kwargs
self,
transformer_width=256,
transformer_heads=4,
transformer_layers=6,
visual_dim=1024,
dropout=0.1,
layer_norm_eps=1e-05,
**kwargs
):
super().__init__()
self.memory_proj = nn.Sequential(
nn.LayerNorm(visual_dim),
nn.LayerNorm(visual_dim, eps=layer_norm_eps),
nn.Linear(visual_dim, transformer_width),
nn.LayerNorm(transformer_width),
nn.LayerNorm(transformer_width, eps=layer_norm_eps),
)
self.text_proj = nn.Sequential(
nn.LayerNorm(visual_dim),
nn.LayerNorm(visual_dim, eps=layer_norm_eps),
nn.Linear(visual_dim, transformer_width),
)
self.decoder = nn.ModuleList(
[
OneFormerTextTransformerDecoderLayer(transformer_width, transformer_heads, dropout)
OneFormerTextTransformerDecoderLayer(transformer_width, transformer_heads, dropout, layer_norm_eps)
for _ in range(transformer_layers)
]
)
self.out_proj = nn.Sequential(nn.LayerNorm(transformer_width), nn.Linear(transformer_width, visual_dim))
self.out_proj = nn.Sequential(
nn.LayerNorm(transformer_width, eps=layer_norm_eps), nn.Linear(transformer_width, visual_dim)
)
def forward(self, text, visual):
visual = self.memory_proj(visual)
@ -2538,12 +2566,12 @@ class OneFormerTextMLP(nn.Module):
class OneFormerTextTransformerLayer(nn.Module):
def __init__(self, width: int, heads: int, attn_mask: torch.Tensor):
def __init__(self, width: int, heads: int, attn_mask: torch.Tensor, layer_norm_eps=1e-05):
super().__init__()
self.self_attn = nn.MultiheadAttention(width, heads)
self.layer_norm1 = nn.LayerNorm(width)
self.layer_norm1 = nn.LayerNorm(width, eps=layer_norm_eps)
self.mlp = OneFormerTextMLP(width, width * 4, width)
self.layer_norm2 = nn.LayerNorm(width)
self.layer_norm2 = nn.LayerNorm(width, eps=layer_norm_eps)
self.attn_mask = attn_mask
def forward(
@ -2572,11 +2600,21 @@ class OneFormerTextTransformerLayer(nn.Module):
class OneFormerTextTransformer(nn.Module):
def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False):
def __init__(
self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_checkpoint=False,
layer_norm_eps=1e-05,
):
super().__init__()
self.width = width
self.num_layers = layers
self.layers = nn.Sequential(*[OneFormerTextTransformerLayer(width, heads, attn_mask) for _ in range(layers)])
self.layers = nn.Sequential(
*[OneFormerTextTransformerLayer(width, heads, attn_mask, layer_norm_eps) for _ in range(layers)]
)
self.use_checkpoint = use_checkpoint
def forward(self, hidden_states: torch.Tensor):
@ -2596,6 +2634,7 @@ class OneFormerTextEncoder(nn.Module):
layers: int,
vocab_size,
use_checkpoint=False,
layer_norm_eps=1e-05,
):
super().__init__()
heads = width // 64
@ -2607,10 +2646,11 @@ class OneFormerTextEncoder(nn.Module):
heads=heads,
attn_mask=self.build_attention_mask(),
use_checkpoint=use_checkpoint,
layer_norm_eps=layer_norm_eps,
)
self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
self.ln_final = nn.LayerNorm(width)
self.ln_final = nn.LayerNorm(width, eps=layer_norm_eps)
self.token_embedding = nn.Embedding(vocab_size, width)
def build_attention_mask(self):
@ -2641,6 +2681,7 @@ class OneFormerTextMapper(nn.Module):
width=config.text_encoder_width,
layers=config.text_encoder_num_layers,
vocab_size=config.text_encoder_vocab_size,
layer_norm_eps=config.layer_norm_eps,
)
self.text_projector = OneFormerMLPPredictionHead(

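The OneFormer hunks above follow a slightly different route: leaf modules that are built from plain arguments rather than a config object now accept a layer_norm_eps keyword (defaulting to 1e-05), and the owning modules forward config.layer_norm_eps down to them. A hedged sketch of that threading pattern, with hypothetical class and attribute names:

```python
from torch import nn


class TinyAttentionBlock(nn.Module):
    # Hypothetical leaf module: it never sees the config, so the epsilon arrives as an argument.
    def __init__(self, d_model: int, layer_norm_eps: float = 1e-5):
        super().__init__()
        self.norm = nn.LayerNorm(d_model, eps=layer_norm_eps)

    def forward(self, hidden_states):
        return self.norm(hidden_states)


class TinyDecoder(nn.Module):
    # Hypothetical owner: reads the epsilon once from its config and passes it down.
    def __init__(self, config):
        super().__init__()
        self.blocks = nn.ModuleList(
            [TinyAttentionBlock(config.hidden_dim, layer_norm_eps=config.layer_norm_eps) for _ in range(config.decoder_layers)]
        )

    def forward(self, hidden_states):
        for block in self.blocks:
            hidden_states = block(hidden_states)
        return hidden_states
```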
src/transformers/models/owlvit/configuration_owlvit.py

@ -66,8 +66,9 @@ class OwlViTTextConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
dropout (`float`, *optional*, defaults to 0.0):
@ -103,7 +104,7 @@ class OwlViTTextConfig(PretrainedConfig):
num_attention_heads=8,
max_position_embeddings=16,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
@ -173,8 +174,9 @@ class OwlViTVisionConfig(PretrainedConfig):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*,
defaults to 1e-5): The epsilon used by the layer normalization layers.
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -212,7 +214,7 @@ class OwlViTVisionConfig(PretrainedConfig):
image_size=768,
patch_size=32,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,

src/transformers/models/owlvit/modeling_owlvit.py

@ -476,9 +476,9 @@ class OwlViTEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = OwlViTAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = OwlViTMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -790,7 +790,7 @@ class OwlViTTextTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = OwlViTTextEmbeddings(config)
self.encoder = OwlViTEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
@ -922,9 +922,9 @@ class OwlViTVisionTransformer(nn.Module):
self.config = config
self.embeddings = OwlViTVisionEmbeddings(config)
self.pre_layernorm = nn.LayerNorm(config.hidden_size)
self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.encoder = OwlViTEncoder(config)
self.post_layernorm = nn.LayerNorm(config.hidden_size)
self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
@ -1318,7 +1318,7 @@ class OwlViTForObjectDetection(OwlViTPreTrainedModel):
self.class_head = OwlViTClassPredictionHead(config)
self.box_head = OwlViTBoxPredictionHead(config)
self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size)
self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
self.sigmoid = nn.Sigmoid()
def normalize_grid_corner_coordinates(self, feature_map: torch.FloatTensor):

src/transformers/models/x_clip/configuration_x_clip.py

@ -95,7 +95,7 @@ class XCLIPTextConfig(PretrainedConfig):
num_attention_heads=8,
max_position_embeddings=77,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,
@ -220,7 +220,7 @@ class XCLIPVisionConfig(PretrainedConfig):
patch_size=32,
num_frames=8,
hidden_act="quick_gelu",
layer_norm_eps=0.00001,
layer_norm_eps=1e-5,
dropout=0.0,
attention_dropout=0.0,
initializer_range=0.02,

src/transformers/models/x_clip/modeling_x_clip.py

@ -311,9 +311,9 @@ class XCLIPEncoderLayer(nn.Module):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = XCLIPAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = XCLIPMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -403,15 +403,15 @@ class XCLIPVisionEncoderLayer(nn.Module):
self.embed_dim = config.hidden_size
self.message_fc = nn.Linear(self.embed_dim, self.embed_dim)
self.message_ln = nn.LayerNorm(self.embed_dim)
self.message_ln = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.message_attn = XCLIPAttention(config)
self.drop_path = XCLIPDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
self.self_attn = XCLIPAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = XCLIPMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
@ -744,7 +744,7 @@ class XCLIPTextTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = XCLIPTextEmbeddings(config)
self.encoder = XCLIPEncoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig)
@ -989,9 +989,9 @@ class XCLIPVisionTransformer(nn.Module):
embed_dim = config.hidden_size
self.embeddings = XCLIPVisionEmbeddings(config)
self.pre_layernorm = nn.LayerNorm(embed_dim)
self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self.encoder = XCLIPVisionEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig)
@ -1218,8 +1218,8 @@ class PromptGeneratorLayer(nn.Module):
embed_dim = config.projection_dim
self.cross_attn = XCLIPCrossAttention(config)
self.norm1 = nn.LayerNorm(embed_dim)
self.norm3 = nn.LayerNorm(embed_dim)
self.norm1 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
self.norm3 = nn.LayerNorm(embed_dim, eps=config.text_config.layer_norm_eps)
self.mlp = nn.Sequential(
nn.Linear(embed_dim, embed_dim * 4),
ACT2FN[config.prompt_hidden_act],
@ -1239,7 +1239,7 @@ class XCLIPPromptGenerator(nn.Module):
def __init__(self, config):
super().__init__()
embed_dim = config.projection_dim
self.layernorm = nn.LayerNorm(embed_dim)
self.layernorm = nn.LayerNorm(embed_dim, eps=config.vision_config.layer_norm_eps)
self.decoder = nn.ModuleList([PromptGeneratorLayer(config) for _ in range(config.prompt_layers)])
self.alpha = nn.Parameter(torch.ones(embed_dim) * config.prompt_alpha)
@ -1284,7 +1284,7 @@ class XCLIPModel(XCLIPPreTrainedModel):
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim)
self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))
mit_config = copy(vision_config)