diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py
index 363bbf3ff6d..5b4de28b7ce 100644
--- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py
+++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py
@@ -1041,7 +1041,7 @@ class TFData2VecVisionConvModule(tf.keras.layers.Layer):
             dilation_rate=dilation,
             name="conv",
         )
-        self.bn = tf.keras.layers.BatchNormalization(name="bn")
+        self.bn = tf.keras.layers.BatchNormalization(name="bn", momentum=0.9, epsilon=1e-5)
         self.activation = tf.nn.relu
 
     def call(self, input: tf.Tensor) -> tf.Tensor:
@@ -1331,7 +1331,7 @@ class TFData2VecVisionForSemanticSegmentation(TFData2VecVisionPreTrainedModel):
         # FPNs
         self.fpn1 = [
             tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.0"),
-            tf.keras.layers.BatchNormalization(name="fpn1.1"),
+            tf.keras.layers.BatchNormalization(name="fpn1.1", momentum=0.9, epsilon=1e-5),
             tf.keras.layers.Activation("gelu"),
             tf.keras.layers.Conv2DTranspose(config.hidden_size, kernel_size=2, strides=2, name="fpn1.3"),
         ]
diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py
index 481f065eb6f..f86d6a47e37 100644
--- a/src/transformers/models/groupvit/modeling_tf_groupvit.py
+++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py
@@ -1253,13 +1253,13 @@ class TFGroupViTMainLayer(tf.keras.layers.Layer):
 
         self.visual_projection = [
             tf.keras.layers.Dense(self.projection_intermediate_dim, name="visual_projection.0"),
-            tf.keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.1, epsilon=1e-5),
+            tf.keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.9, epsilon=1e-5),
             tf.keras.layers.ReLU(name="visual_projection.2"),
             tf.keras.layers.Dense(self.projection_dim, name="visual_projection.3"),
         ]
         self.text_projection = [
             tf.keras.layers.Dense(self.projection_intermediate_dim, name="text_projection.0"),
-            tf.keras.layers.BatchNormalization(name="text_projection.1", momentum=0.1, epsilon=1e-5),
+            tf.keras.layers.BatchNormalization(name="text_projection.1", momentum=0.9, epsilon=1e-5),
             tf.keras.layers.ReLU(name="text_projection.2"),
             tf.keras.layers.Dense(self.projection_dim, name="text_projection.3"),
         ]
diff --git a/src/transformers/models/regnet/modeling_tf_regnet.py b/src/transformers/models/regnet/modeling_tf_regnet.py
index 1d43d6eb7f8..8bbc37951a9 100644
--- a/src/transformers/models/regnet/modeling_tf_regnet.py
+++ b/src/transformers/models/regnet/modeling_tf_regnet.py
@@ -74,7 +74,7 @@ class TFRegNetConvLayer(tf.keras.layers.Layer):
             use_bias=False,
             name="convolution",
         )
-        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization")
+        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
         self.activation = ACT2FN[activation] if activation is not None else tf.identity
 
     def call(self, hidden_state):
@@ -126,7 +126,7 @@ class TFRegNetShortCut(tf.keras.layers.Layer):
         self.convolution = tf.keras.layers.Conv2D(
             filters=out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution"
         )
-        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization")
+        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
 
     def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
         return self.normalization(self.convolution(inputs), training=training)
diff --git a/src/transformers/models/resnet/modeling_tf_resnet.py b/src/transformers/models/resnet/modeling_tf_resnet.py
index bed053ae404..4cf0d21ec77 100644
--- a/src/transformers/models/resnet/modeling_tf_resnet.py
+++ b/src/transformers/models/resnet/modeling_tf_resnet.py
@@ -60,7 +60,7 @@ class TFResNetConvLayer(tf.keras.layers.Layer):
             out_channels, kernel_size=kernel_size, strides=stride, padding="valid", use_bias=False, name="convolution"
         )
         # Use same default momentum and epsilon as PyTorch equivalent
-        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization")
+        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
         self.activation = ACT2FN[activation] if activation is not None else tf.keras.layers.Activation("linear")
 
     def convolution(self, hidden_state: tf.Tensor) -> tf.Tensor:
@@ -119,7 +119,7 @@ class TFResNetShortCut(tf.keras.layers.Layer):
             out_channels, kernel_size=1, strides=stride, use_bias=False, name="convolution"
         )
         # Use same default momentum and epsilon as PyTorch equivalent
-        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="normalization")
+        self.normalization = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
 
     def call(self, x: tf.Tensor, training: bool = False) -> tf.Tensor:
         hidden_state = x
diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py
index 2ff256d78d2..c6a8e001460 100644
--- a/src/transformers/models/segformer/modeling_tf_segformer.py
+++ b/src/transformers/models/segformer/modeling_tf_segformer.py
@@ -741,7 +741,7 @@ class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
         self.linear_fuse = tf.keras.layers.Conv2D(
             filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse"
         )
-        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="batch_norm")
+        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm")
         self.activation = tf.keras.layers.Activation("relu")
         self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob)
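
Context for the repeated momentum change above: tf.keras.layers.BatchNormalization and torch.nn.BatchNorm2d parameterize the running-statistics update in complementary directions. Keras weights the existing moving average by momentum, while PyTorch weights the new batch statistic by momentum, so PyTorch's default momentum of 0.1 corresponds to momentum=0.9 in Keras. A minimal sketch of the two update rules (the helper function names are illustrative, not taken from either library):

# Both frameworks track running statistics for inference, but define the
# exponential moving average with opposite momentum conventions.

def torch_style_update(running: float, batch_stat: float, momentum: float = 0.1) -> float:
    # torch.nn.BatchNorm2d: new = (1 - momentum) * running + momentum * batch_stat
    return (1.0 - momentum) * running + momentum * batch_stat

def keras_style_update(moving: float, batch_stat: float, momentum: float = 0.9) -> float:
    # tf.keras.layers.BatchNormalization: new = momentum * moving + (1 - momentum) * batch_stat
    return momentum * moving + (1.0 - momentum) * batch_stat

# PyTorch momentum 0.1 and Keras momentum 0.9 produce identical updates:
assert torch_style_update(1.0, 0.0, momentum=0.1) == keras_style_update(1.0, 0.0, momentum=0.9)

With the previous value of 0.1, the Keras layers kept only 10% of the accumulated moving average per step, so the TF ports drifted away from the PyTorch running statistics during training; 0.9 matches the PyTorch defaults that the checkpoints were trained with.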