Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)
Add TensorFlow implementation of EfficientFormer (#22620)
* Add tf code for efficientformer
* Fix return dict bug - return last hidden state after last stage
* Fix corresponding return dict bug
* Override test tol
* Change default values of training to False
* Set training to default False X3
* Rm axis from ln
* Set init in dense projection
* Rm debug stuff
* Make style; all tests pass.
* Modify year to 2023
* Fix attention biases codes
* Update the shape list logic
* Add a batch norm eps config
* Remove extract comments in test files
* Add conditional attn and hidden states return for serving output
* Change channel dim checking logic
* Add exception for withteacher model in training mode
* Revert layer count for now
* Add layer count for conditional layer naming
* Transpose for conv happens only in main layer
* Make tests smaller
* Make style
* Update doc
* Rm from_pt
* Change to actual expect image class label
* Remove stray print in tests
* Update image processor test
* Remove the old serving output logic
* Make style
* Make style
* Complete test
This commit is contained in: parent 9fea71b465, commit 88f50a1e89
@@ -313,7 +313,7 @@ Flax), PyTorch, and/or TensorFlow.
| DonutSwin | ❌ | ❌ | ✅ | ❌ | ❌ |
| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
| DPT | ❌ | ❌ | ✅ | ❌ | ❌ |
-| EfficientFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
+| EfficientFormer | ❌ | ❌ | ✅ | ✅ | ❌ |
| EfficientNet | ❌ | ❌ | ✅ | ❌ | ❌ |
| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ |
| Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
@@ -37,7 +37,7 @@ EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work pr
reach extremely low latency on mobile devices while maintaining high performance.*

This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
-The original code can be found [here](https://github.com/snap-research/EfficientFormer).
+The original code can be found [here](https://github.com/snap-research/EfficientFormer). The TensorFlow version of this model was added by [D-Roberts](https://huggingface.co/D-Roberts).

## Documentation resources
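Editor's note: a minimal, hedged usage sketch of the classes this PR adds (not part of the docs diff itself). It assumes TF weights are available for the `snap-research/efficientformer-l1-300` checkpoint named in the archive list further below, and that Pillow and requests are installed.

```python
import tensorflow as tf
import requests
from PIL import Image

from transformers import AutoImageProcessor, TFEfficientFormerForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
model = TFEfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")

inputs = processor(images=image, return_tensors="tf")
logits = model(**inputs).logits  # shape (1, num_labels)
predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class])  # the docstring constants below expect "LABEL_281"
```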
@@ -66,3 +66,18 @@ The original code can be found [here](https://github.com/snap-research/Efficient

[[autodoc]] EfficientFormerForImageClassificationWithTeacher
    - forward

## TFEfficientFormerModel

[[autodoc]] TFEfficientFormerModel
    - call

## TFEfficientFormerForImageClassification

[[autodoc]] TFEfficientFormerForImageClassification
    - call

## TFEfficientFormerForImageClassificationWithTeacher

[[autodoc]] TFEfficientFormerForImageClassificationWithTeacher
    - call
@@ -3142,6 +3142,15 @@ else:
            "TFDPRReader",
        ]
    )
    _import_structure["models.efficientformer"].extend(
        [
            "TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
            "TFEfficientFormerForImageClassification",
            "TFEfficientFormerForImageClassificationWithTeacher",
            "TFEfficientFormerModel",
            "TFEfficientFormerPreTrainedModel",
        ]
    )
    _import_structure["models.electra"].extend(
        [
            "TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -6471,6 +6480,13 @@ if TYPE_CHECKING:
        TFDPRQuestionEncoder,
        TFDPRReader,
    )
    from .models.efficientformer import (
        TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
        TFEfficientFormerForImageClassification,
        TFEfficientFormerForImageClassificationWithTeacher,
        TFEfficientFormerModel,
        TFEfficientFormerPreTrainedModel,
    )
    from .models.electra import (
        TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
        TFElectraForMaskedLM,
@@ -47,6 +47,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict(
        ("deit", "TFDeiTModel"),
        ("distilbert", "TFDistilBertModel"),
        ("dpr", "TFDPRQuestionEncoder"),
        ("efficientformer", "TFEfficientFormerModel"),
        ("electra", "TFElectraModel"),
        ("esm", "TFEsmModel"),
        ("flaubert", "TFFlaubertModel"),
@@ -202,6 +203,10 @@ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("cvt", "TFCvtForImageClassification"),
        ("data2vec-vision", "TFData2VecVisionForImageClassification"),
        ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
        (
            "efficientformer",
            ("TFEfficientFormerForImageClassification", "TFEfficientFormerForImageClassificationWithTeacher"),
        ),
        ("mobilevit", "TFMobileViTForImageClassification"),
        ("regnet", "TFRegNetForImageClassification"),
        ("resnet", "TFResNetForImageClassification"),
@@ -13,7 +13,13 @@
# limitations under the License.
from typing import TYPE_CHECKING

-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_tf_available,
+    is_torch_available,
+    is_vision_available,
+)


_import_structure = {
@@ -45,6 +51,20 @@ else:
        "EfficientFormerPreTrainedModel",
    ]

try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tf_efficientformer"] = [
        "TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFEfficientFormerForImageClassification",
        "TFEfficientFormerForImageClassificationWithTeacher",
        "TFEfficientFormerModel",
        "TFEfficientFormerPreTrainedModel",
    ]

if TYPE_CHECKING:
    from .configuration_efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig

@@ -69,6 +89,19 @@ if TYPE_CHECKING:
        EfficientFormerModel,
        EfficientFormerPreTrainedModel,
    )
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_efficientformer import (
            TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFEfficientFormerForImageClassification,
            TFEfficientFormerForImageClassificationWithTeacher,
            TFEfficientFormerModel,
            TFEfficientFormerPreTrainedModel,
        )

else:
    import sys
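The try/except blocks above follow the library's standard optional-dependency pattern. A self-contained, hedged sketch of the same idea (the helper name below is illustrative, not the real `transformers` utility):

```python
# Hedged sketch of the optional-dependency gating used above; illustrative only.
def tf_is_installed() -> bool:
    try:
        import tensorflow  # noqa: F401
    except ImportError:
        return False
    return True


_import_structure = {"configuration_efficientformer": ["EfficientFormerConfig"]}
if tf_is_installed():
    # TF symbols are only registered when TensorFlow can be imported, so the
    # package still imports cleanly in a PyTorch-only environment.
    _import_structure["modeling_tf_efficientformer"] = ["TFEfficientFormerModel"]
print(sorted(_import_structure))
```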
@@ -52,7 +52,7 @@ class EfficientFormerConfig(PretrainedConfig):
            The size of the key in meta3D block.
        attention_ratio (`int`, *optional*, defaults to 4):
            Ratio of the dimension of the query and value to the dimension of the key in MSHA block
-        resolution (`int`, *optional*, defaults to 5)
+        resolution (`int`, *optional*, defaults to 7)
            Size of each patch
        num_hidden_layers (`int`, *optional*, defaults to 5):
            Number of hidden layers in the Transformer encoder.
@@ -91,6 +91,8 @@ class EfficientFormerConfig(PretrainedConfig):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to `224`):
            The size (resolution) of each image.

    Example:

@@ -136,6 +138,8 @@ class EfficientFormerConfig(PretrainedConfig):
        hidden_act: str = "gelu",
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        image_size: int = 224,
        batch_norm_eps: float = 1e-05,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
@@ -165,3 +169,5 @@ class EfficientFormerConfig(PretrainedConfig):
        self.distillation = distillation
        self.use_layer_scale = use_layer_scale
        self.layer_scale_init_value = layer_scale_init_value
        self.image_size = image_size
        self.batch_norm_eps = batch_norm_eps
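A quick hedged illustration of the new config knob (values shown are just the defaults from the diff):

```python
from transformers import EfficientFormerConfig

# batch_norm_eps is the field added by this PR; 1e-05 is its default.
config = EfficientFormerConfig(layer_norm_eps=1e-12, batch_norm_eps=1e-05)
print(config.batch_norm_eps)  # 1e-05
```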
@@ -43,7 +43,7 @@ _CONFIG_FOR_DOC = "EfficientFormerConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
-_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
@@ -73,7 +73,7 @@ class EfficientFormerPatchEmbeddings(nn.Module):
            stride=config.downsample_stride,
            padding=config.downsample_pad,
        )
-        self.norm = nn.BatchNorm2d(embed_dim) if apply_norm else nn.Identity()
+        self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity()

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
@@ -157,10 +157,10 @@ class EfficientFormerConvStem(nn.Module):
        super().__init__()

        self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1)
-        self.batchnorm_before = nn.BatchNorm2d(out_channels // 2)
+        self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps)

        self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1)
-        self.batchnorm_after = nn.BatchNorm2d(out_channels)
+        self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)

        self.activation = nn.ReLU()

@@ -224,24 +224,24 @@ class EfficientFormerConvMlp(nn.Module):
        hidden_features = hidden_features or in_features

        self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
-        self.actvation = ACT2FN[config.hidden_act]
+        self.activation = ACT2FN[config.hidden_act]
        self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
        self.dropout = nn.Dropout(drop)

-        self.batchnorm_before = nn.BatchNorm2d(hidden_features)
-        self.batchnorm_after = nn.BatchNorm2d(out_features)
+        self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
+        self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.convolution1(hidden_state)
        hidden_state = self.batchnorm_before(hidden_state)

-        hidden_state = self.actvation(hidden_state)
+        hidden_state = self.activation(hidden_state)
        hidden_state = self.dropout(hidden_state)
        hidden_state = self.convolution2(hidden_state)

        hidden_state = self.batchnorm_after(hidden_state)

        hidden_state = self.dropout(hidden_state)

        return hidden_state

@@ -266,7 +266,7 @@ def drop_path(input, drop_prob: float = 0.0, training: bool = False):
    return output


-# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Bit
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->EfficientFormer
class EfficientFormerDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

@@ -301,8 +301,10 @@ class EfficientFormerMeta3D(nn.Module):
            attention_ratio=config.attention_ratio,
            resolution=config.resolution,
        )
-        self.layernorm1 = nn.LayerNorm(dim)
-        self.layernorm2 = nn.LayerNorm(dim)
+        self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)

        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
        self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)

@@ -346,15 +348,20 @@ class EfficientFormerMeta3DLayers(nn.Module):

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> Tuple[torch.Tensor]:
        all_attention_outputs = () if output_attentions else None

        for layer_module in self.blocks:
            if isinstance(hidden_states, tuple):
                hidden_states = hidden_states[0]

            hidden_states = layer_module(hidden_states, output_attentions)

            if output_attentions:
                all_attention_outputs = all_attention_outputs + (hidden_states[1],)

        if output_attentions:
            outputs = (hidden_states[0],) + all_attention_outputs
            return outputs

        return hidden_states

@@ -379,6 +386,7 @@ class EfficientFormerMeta4D(nn.Module):

        if self.use_layer_scale:
            layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs)

            layer_output = layer_output + self.drop_path(
                self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output)
            )
@@ -398,6 +406,7 @@ class EfficientFormerMeta4DLayers(nn.Module):
        drop_paths = [
            config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
        ]

        self.blocks = nn.ModuleList(
            [
                EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
@@ -446,6 +455,7 @@ class EfficientFormerEncoder(nn.Module):
            for i in range(num_intermediate_stages)
        ]
        intermediate_stages = []

        for i in range(num_intermediate_stages):
            intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
            if downsamples[i]:
@@ -475,6 +485,7 @@ class EfficientFormerEncoder(nn.Module):
            all_hidden_states = all_hidden_states + (hidden_states,)

        layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)

        if output_attentions:
            all_self_attentions = all_self_attentions + layer_output[1:]

@@ -482,7 +493,7 @@ class EfficientFormerEncoder(nn.Module):
            all_hidden_states = all_hidden_states + (layer_output[0],)

        if not return_dict:
-            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+            return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=layer_output[0],
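Threading `config.batch_norm_eps` through both backends is what keeps their normalization numerically aligned. A hedged parity sketch (not part of the diff; the momentum correspondence is noted in comments in the TF file below):

```python
import torch
import tensorflow as tf

batch_norm_eps = 1e-05
# Channels-first in PyTorch, channels-last in Keras; same epsilon on both sides.
# PyTorch's default momentum of 0.1 corresponds to Keras momentum=0.9.
pt_norm = torch.nn.BatchNorm2d(8, eps=batch_norm_eps)
tf_norm = tf.keras.layers.BatchNormalization(axis=-1, epsilon=batch_norm_eps, momentum=0.9)
```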
src/transformers/models/efficientformer/modeling_tf_efficientformer.py (new file, 986 lines)
@@ -0,0 +1,986 @@
# coding=utf-8
# Copyright 2023 Snapchat Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TensorFlow EfficientFormer model."""

import itertools
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import tensorflow as tf

from ...activations_tf import ACT2FN
from ...modeling_tf_outputs import (
    TFBaseModelOutput,
    TFBaseModelOutputWithPooling,
    TFImageClassifierOutput,
)
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_efficientformer import EfficientFormerConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "EfficientFormerConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281"


TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "snap-research/efficientformer-l1-300",
    # See all EfficientFormer models at https://huggingface.co/models?filter=efficientformer
]


class TFEfficientFormerPatchEmbeddings(tf.keras.layers.Layer):
    """
    This class performs downsampling between two stages. For an input tensor with the shape [batch_size,
    num_channels, height, width], it produces an output tensor with the shape [batch_size, num_channels,
    height/stride, width/stride].
    """

    def __init__(
        self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True, **kwargs
    ) -> None:
        super().__init__(**kwargs)
        self.num_channels = num_channels

        self.padding = tf.keras.layers.ZeroPadding2D(padding=config.downsample_pad)
        self.projection = tf.keras.layers.Conv2D(
            filters=embed_dim,
            kernel_size=config.downsample_patch_size,
            strides=config.downsample_stride,
            padding="valid",
            name="projection",
        )
        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
        self.norm = (
            tf.keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm")
            if apply_norm
            else tf.identity
        )

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        tf.debugging.assert_shapes(
            [(pixel_values, (..., None, None, self.num_channels))],
            message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
        )
        embeddings = self.projection(self.padding(pixel_values))
        embeddings = self.norm(embeddings, training=training)
        return embeddings
class TFEfficientFormerSelfAttention(tf.keras.layers.Layer):
    def __init__(
        self,
        dim: int,
        key_dim: int,
        num_heads: int,
        attention_ratio: int,
        resolution: int,
        config: EfficientFormerConfig,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.key_dim = key_dim
        self.attention_ratio = attention_ratio
        self.scale = key_dim**-0.5
        self.total_key_dim = key_dim * num_heads
        self.expanded_key_dim = int(attention_ratio * key_dim)
        self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
        hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2

        self.qkv = tf.keras.layers.Dense(
            units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
        )
        self.projection = tf.keras.layers.Dense(
            units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
        )
        self.resolution = resolution

    def build(self, input_shape: tf.TensorShape) -> None:
        points = list(itertools.product(range(self.resolution), range(self.resolution)))
        num_points = len(points)
        attention_offsets = {}

        idxs = []

        for point_1 in points:
            for point_2 in points:
                offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])

        self.attention_biases = self.add_weight(
            shape=(self.num_heads, len(attention_offsets)),
            initializer=tf.keras.initializers.zeros(),
            trainable=True,
            name="attention_biases",
        )
        self.attention_bias_idxs = self.add_weight(
            shape=(num_points, num_points),
            trainable=False,
            dtype=tf.int32,
            name="attention_bias_idxs",
        )

        self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points)))

        super().build(input_shape)

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        batch_size, sequence_length, *_ = shape_list(hidden_states)
        qkv = self.qkv(inputs=hidden_states)

        query_layer, key_layer, value_layer = tf.split(
            tf.reshape(tensor=qkv, shape=(batch_size, sequence_length, self.num_heads, -1)),
            num_or_size_splits=[self.key_dim, self.key_dim, self.expanded_key_dim],
            axis=3,
        )

        query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3])
        key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3])
        value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3])

        attention_probs = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0, 1, 3, 2]))
        scale = tf.cast(self.scale, dtype=attention_probs.dtype)
        attention_probs = tf.multiply(attention_probs, scale)

        attention_biases = tf.gather(params=self.attention_biases, indices=self.attention_bias_idxs, axis=1)
        attention_probs = attention_probs + attention_biases
        attention_probs = stable_softmax(logits=attention_probs, axis=-1)

        context_layer = tf.matmul(attention_probs, value_layer)
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])

        context_layer = tf.reshape(
            tensor=context_layer, shape=(batch_size, sequence_length, self.total_expanded_key_dim)
        )
        context_layer = self.projection(context_layer)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
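For intuition, a standalone sketch of the relative-position bias indexing that `build` computes above; `resolution=2` is illustrative only:

```python
import itertools

resolution = 2  # illustrative; the real value comes from config.resolution
points = list(itertools.product(range(resolution), range(resolution)))
attention_offsets, idxs = {}, []
for point_1 in points:
    for point_2 in points:
        offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
        if offset not in attention_offsets:
            attention_offsets[offset] = len(attention_offsets)
        idxs.append(attention_offsets[offset])

# 4 tokens on a 2x2 grid give 16 pairs but only 4 distinct |dx|,|dy| offsets,
# so the learned bias table has 4 columns that attention_bias_idxs gathers from.
print(len(points) ** 2, len(attention_offsets))  # 16 4
```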
class TFEfficientFormerConvStem(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs):
        super().__init__(**kwargs)

        self.padding = tf.keras.layers.ZeroPadding2D(padding=1)
        self.convolution1 = tf.keras.layers.Conv2D(
            filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1"
        )
        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
        self.batchnorm_before = tf.keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
        )

        self.convolution2 = tf.keras.layers.Conv2D(
            filters=out_channels,
            kernel_size=3,
            strides=2,
            padding="valid",
            name="convolution2",
        )
        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
        self.batchnorm_after = tf.keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
        )

        self.activation = tf.keras.layers.Activation(activation=tf.keras.activations.relu, name="activation")

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training)
        features = self.activation(features)
        features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training)
        features = self.activation(features)
        return features


class TFEfficientFormerPooling(tf.keras.layers.Layer):
    def __init__(self, pool_size: int, **kwargs):
        super().__init__(**kwargs)
        self.pool = tf.keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same")

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        output = self.pool(hidden_states)
        output = output - hidden_states
        return output
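A hedged note on `TFEfficientFormerPooling`: it returns `pool(x) - x`, i.e. only the pooling residual, so when the Meta4D block adds the result back to its input the net effect is plain average-pool token mixing. A tiny check:

```python
import tensorflow as tf

pool = tf.keras.layers.AveragePooling2D(pool_size=3, strides=1, padding="same")
x = tf.random.normal((1, 7, 7, 8))
token_mixed = pool(x) - x  # what the layer above returns
# Adding the residual back, as TFEfficientFormerMeta4D does, recovers pool(x).
assert float(tf.reduce_max(tf.abs((x + token_mixed) - pool(x)))) < 1e-5
```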
class TFEfficientFormerDenseMlp(tf.keras.layers.Layer):
    def __init__(
        self,
        config: EfficientFormerConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.linear_in = tf.keras.layers.Dense(
            units=hidden_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_in"
        )
        self.activation = ACT2FN[config.hidden_act]
        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)

        self.linear_out = tf.keras.layers.Dense(
            units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out"
        )

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_states = self.linear_in(inputs=hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(inputs=hidden_states, training=training)
        hidden_states = self.linear_out(inputs=hidden_states)
        hidden_states = self.dropout(inputs=hidden_states, training=training)

        return hidden_states


class TFEfficientFormerConvMlp(tf.keras.layers.Layer):
    def __init__(
        self,
        config: EfficientFormerConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        drop: float = 0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        self.convolution1 = tf.keras.layers.Conv2D(
            filters=hidden_features,
            kernel_size=1,
            name="convolution1",
            padding="valid",
        )

        self.activation = ACT2FN[config.hidden_act]

        self.convolution2 = tf.keras.layers.Conv2D(
            filters=out_features,
            kernel_size=1,
            name="convolution2",
            padding="valid",
        )

        self.dropout = tf.keras.layers.Dropout(rate=drop)

        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
        self.batchnorm_before = tf.keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
        )
        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
        self.batchnorm_after = tf.keras.layers.BatchNormalization(
            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
        )

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.convolution1(hidden_state)
        hidden_state = self.batchnorm_before(hidden_state, training=training)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.dropout(hidden_state, training=training)
        hidden_state = self.convolution2(hidden_state)
        hidden_state = self.batchnorm_after(hidden_state, training=training)
        hidden_state = self.dropout(hidden_state, training=training)
        return hidden_state
# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer
class TFEfficientFormerDropPath(tf.keras.layers.Layer):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    References:
        (1) github.com:rwightman/pytorch-image-models
    """

    def __init__(self, drop_path, **kwargs):
        super().__init__(**kwargs)
        self.drop_path = drop_path

    def call(self, x, training=None):
        if training:
            keep_prob = 1 - self.drop_path
            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
            random_tensor = tf.floor(random_tensor)
            return (x / keep_prob) * random_tensor
        return x


class TFEfficientFormerFlat(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, hidden_states: tf.Tensor) -> Tuple[tf.Tensor]:
        batch_size, _, _, in_channels = shape_list(hidden_states)
        hidden_states = tf.reshape(hidden_states, shape=[batch_size, -1, in_channels])
        return hidden_states
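A hedged numeric check of the stochastic-depth math in `TFEfficientFormerDropPath.call`: dividing by `keep_prob` keeps the expectation unchanged while whole samples are randomly zeroed.

```python
import tensorflow as tf

drop_path, keep_prob = 0.2, 0.8
x = tf.ones((100000, 1))
shape = (tf.shape(x)[0],) + (1,) * (len(x.shape) - 1)
random_tensor = tf.floor(keep_prob + tf.random.uniform(shape, 0, 1))  # 1 w.p. keep_prob
out = (x / keep_prob) * random_tensor
print(float(tf.reduce_mean(out)))  # ~1.0, matching E[x]
```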
class TFEfficientFormerMeta3D(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
        super().__init__(**kwargs)

        self.token_mixer = TFEfficientFormerSelfAttention(
            dim=config.dim,
            key_dim=config.key_dim,
            num_heads=config.num_attention_heads,
            attention_ratio=config.attention_ratio,
            resolution=config.resolution,
            name="token_mixer",
            config=config,
        )
        self.dim = dim
        self.config = config

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1")
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2")
        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
        self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp")

        # Using `layers.Activation` instead of `tf.identity` to better control `training` behavior.
        self.drop_path = (
            TFEfficientFormerDropPath(drop_path)
            if drop_path > 0.0
            else tf.keras.layers.Activation("linear", name="drop_path")
        )
        self.config = config

    def build(self, input_shape: tf.TensorShape):
        self.layer_scale_1 = None
        self.layer_scale_2 = None

        if self.config.use_layer_scale:
            self.layer_scale_1 = self.add_weight(
                shape=(self.dim,),
                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_1",
            )
            self.layer_scale_2 = self.add_weight(
                shape=(self.dim,),
                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_2",
            )
        super().build(input_shape)

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        self_attention_outputs = self.token_mixer(
            hidden_states=self.layernorm1(hidden_states, training=training),
            output_attentions=output_attentions,
            training=training,
        )

        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.config.use_layer_scale:
            layer_output = hidden_states + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * attention_output,
                training=training,
            )
            layer_output = layer_output + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
                * self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
                training=training,
            )
        else:
            layer_output = hidden_states + self.drop_path(attention_output, training=training)
            layer_output = layer_output + self.drop_path(
                self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
                training=training,
            )

        outputs = (layer_output,) + outputs

        return outputs


class TFEfficientFormerMeta3DLayers(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, **kwargs):
        super().__init__(**kwargs)
        drop_paths = [
            config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
            for block_idx in range(config.num_meta3d_blocks)
        ]
        self.blocks = [
            TFEfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path, name=f"blocks.{i}")
            for i, drop_path in enumerate(drop_paths)
        ]

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        all_attention_outputs = () if output_attentions else None

        for i, layer_module in enumerate(self.blocks):
            if isinstance(hidden_states, tuple):
                hidden_states = hidden_states[0]

            hidden_states = layer_module(
                hidden_states=hidden_states, output_attentions=output_attentions, training=training
            )
            if output_attentions:
                all_attention_outputs = all_attention_outputs + (hidden_states[1],)

        if output_attentions:
            outputs = (hidden_states[0],) + all_attention_outputs
            return outputs

        return hidden_states
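For the layer-scale branches above, a hedged shape sketch: the learned per-channel gains broadcast over the batch and token axes (the init value mirrors `config.layer_scale_init_value`; 1e-5 here is illustrative).

```python
import tensorflow as tf

dim = 448  # last-stage width, per the base docstring's expected shape [1, 49, 448]
layer_scale_1 = tf.fill((dim,), 1e-5)              # illustrative init value
attention_output = tf.random.normal((2, 49, dim))  # (batch, tokens, dim)
scaled = tf.expand_dims(tf.expand_dims(layer_scale_1, 0), 0) * attention_output
print(scaled.shape)  # (2, 49, 448)
```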
class TFEfficientFormerMeta4D(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
        super().__init__(**kwargs)
        pool_size = config.pool_size if config.pool_size is not None else 3
        self.token_mixer = TFEfficientFormerPooling(pool_size=pool_size, name="token_mixer")
        self.dim = dim
        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
        self.mlp = TFEfficientFormerConvMlp(
            config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob, name="mlp"
        )

        self.drop_path = (
            TFEfficientFormerDropPath(drop_path, name="drop_path")
            if drop_path > 0.0
            else tf.keras.layers.Activation("linear", name="drop_path")
        )
        self.config = config

    def build(self, input_shape: tf.TensorShape):
        self.layer_scale_1 = None
        self.layer_scale_2 = None

        if self.config.use_layer_scale:
            self.layer_scale_1 = self.add_weight(
                shape=(self.dim),
                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_1",
            )
            self.layer_scale_2 = self.add_weight(
                shape=(self.dim),
                initializer=tf.keras.initializers.Constant(value=self.config.layer_scale_init_value),
                trainable=True,
                name="layer_scale_2",
            )
        super().build(input_shape)

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
        outputs = self.token_mixer(hidden_states)

        if self.config.use_layer_scale:
            layer_output = hidden_states + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * outputs,
                training=training,
            )

            layer_output = layer_output + self.drop_path(
                tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
                * self.mlp(hidden_state=layer_output, training=training),
                training=training,
            )

        else:
            layer_output = hidden_states + self.drop_path(outputs, training=training)
            layer_output = layer_output + self.drop_path(
                self.mlp(hidden_state=layer_output, training=training), training=training
            )

        return layer_output


class TFEfficientFormerMeta4DLayers(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs):
        super().__init__(**kwargs)
        num_layers = (
            config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
        )
        drop_paths = [
            config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
        ]

        self.blocks = [
            TFEfficientFormerMeta4D(
                config=config, dim=config.hidden_sizes[stage_idx], drop_path=drop_paths[i], name=f"blocks.{i}"
            )
            for i in range(len(drop_paths))
        ]

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
        for layer_module in self.blocks:
            hidden_states = layer_module(hidden_states=hidden_states, training=training)
        return hidden_states
class TFEfficientFormerIntermediateStage(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, index: int, **kwargs):
        super().__init__(**kwargs)
        self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers")

    def call(self, hidden_states: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor]:
        hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
        return hidden_states


class TFEfficientFormerLastStage(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, **kwargs):
        super().__init__(**kwargs)
        self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers")
        self.flat = TFEfficientFormerFlat(name="flat")
        self.meta3D_layers = TFEfficientFormerMeta3DLayers(config, name="meta3D_layers")

    def call(
        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
    ) -> Tuple[tf.Tensor]:
        hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
        hidden_states = self.flat(hidden_states=hidden_states)
        hidden_states = self.meta3D_layers(
            hidden_states=hidden_states, output_attentions=output_attentions, training=training
        )

        return hidden_states
class TFEfficientFormerEncoder(tf.keras.layers.Layer):
    def __init__(self, config: EfficientFormerConfig, **kwargs):
        super().__init__(**kwargs)

        self.config = config
        num_intermediate_stages = len(config.depths) - 1
        downsamples = [
            config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
            for i in range(num_intermediate_stages)
        ]

        intermediate_stages = []
        layer_count = -1
        for i in range(num_intermediate_stages):
            layer_count += 1
            intermediate_stages.append(
                TFEfficientFormerIntermediateStage(config, i, name=f"intermediate_stages.{layer_count}")
            )
            if downsamples[i]:
                layer_count += 1
                intermediate_stages.append(
                    TFEfficientFormerPatchEmbeddings(
                        config,
                        config.hidden_sizes[i],
                        config.hidden_sizes[i + 1],
                        name=f"intermediate_stages.{layer_count}",
                    )
                )
        self.intermediate_stages = intermediate_stages
        self.last_stage = TFEfficientFormerLastStage(config, name="last_stage")

    def call(
        self,
        hidden_states: tf.Tensor,
        output_hidden_states: bool,
        output_attentions: bool,
        return_dict: bool,
        training: bool = False,
    ) -> TFBaseModelOutput:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        for layer_module in self.intermediate_stages:
            hidden_states = layer_module(hidden_states, training=training)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        layer_output = self.last_stage(hidden_states, output_attentions=output_attentions, training=training)

        if output_attentions:
            all_self_attentions = all_self_attentions + layer_output[1:]

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (layer_output[0],)

        if not return_dict:
            return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)

        return TFBaseModelOutput(
            last_hidden_state=layer_output[0],
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
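A hedged sketch of why the encoder keeps a running `layer_count` (the "conditional layer naming" from the commit message): a patch-embedding downsampler is interleaved after any stage whose width changes, and both kinds of sublayer share the `intermediate_stages.{n}` name space. Widths below are illustrative:

```python
hidden_sizes = [48, 96, 224, 448]  # illustrative stage widths
downsamples = [True, True, True]
names, layer_count = [], -1
for i in range(3):
    layer_count += 1
    names.append(f"intermediate_stages.{layer_count}")  # Meta4D stage
    if downsamples[i]:
        layer_count += 1
        names.append(f"intermediate_stages.{layer_count}")  # patch embedding
print(names)  # ['intermediate_stages.0', ..., 'intermediate_stages.5']
```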
@keras_serializable
class TFEfficientFormerMainLayer(tf.keras.layers.Layer):
    config_class = EfficientFormerConfig

    def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
        super().__init__(**kwargs)
        self.config = config

        self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed")
        self.encoder = TFEfficientFormerEncoder(config, name="encoder")
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")

    @unpack_inputs
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        output_attentions: Optional[tf.Tensor] = None,
        output_hidden_states: Optional[tf.Tensor] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor, ...]]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # When running on CPU, tf.keras.layers.Conv2D and tf.keras.layers.AveragePool2D do not
        # support channels first NCHW format. A number of blocks contain both.
        # So change the input format from (batch_size, num_channels, height, width) to
        # (batch_size, height, width, num_channels) here.
        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
        embedding_output = self.patch_embed(pixel_values, training=training)

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output, training=training)

        # Change the hidden states from (batch_size, height, width, num_channels) to
        # (batch_size, num_channels, height, width).
        # The hidden states are in (batch_size, height, width, num_channels)
        # shape after all stages except the MB3D blocks.
        if output_hidden_states:
            hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1][:-1]]) + (
                encoder_outputs[1][-1],
            )

        if not return_dict:
            head_outputs = (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return TFBaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
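A tiny hedged demo of the NCHW-to-NHWC transpose done at the top of `TFEfficientFormerMainLayer.call` (the library-wide input convention is channels-first, but Keras conv/pool layers on CPU require channels-last):

```python
import tensorflow as tf

pixel_values = tf.random.normal((1, 3, 224, 224))              # NCHW, as the API expects
channels_last = tf.transpose(pixel_values, perm=(0, 2, 3, 1))  # NHWC for Conv2D
print(channels_last.shape)  # (1, 224, 224, 3)
```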
class TFEfficientFormerPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = EfficientFormerConfig
    base_model_prefix = "efficientformer"
    main_input_name = "pixel_values"


EFFICIENTFORMER_START_DOCSTRING = r"""
    This model is a TensorFlow
    [tf.keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
    TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior.


    Parameters:
        config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`EfficientFormerImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
    EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerModel(TFEfficientFormerPreTrainedModel):
    def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
        super().__init__(config, **kwargs)

        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[Tuple, TFBaseModelOutput]:
        outputs = self.efficientformer(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs
@add_start_docstrings(
    """
    EfficientFormer Model transformer with an image classification head on top of pooled last hidden state, e.g. for
    ImageNet.
    """,
    EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: EfficientFormerConfig):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")

        # Classifier head
        self.classifier = (
            tf.keras.layers.Dense(config.num_labels, name="classifier")
            if config.num_labels > 0
            else tf.keras.layers.Activation("linear", name="classifier")
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        labels: Optional[tf.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[tf.Tensor, TFImageClassifierOutput]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.efficientformer(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = outputs[0]

        logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))

        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TFImageClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
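The classification head above mean-pools over the token axis before the dense layer; a hedged shape walkthrough using the expected last-stage shape `[1, 49, 448]` from the docstring constants:

```python
import tensorflow as tf

sequence_output = tf.random.normal((1, 49, 448))   # (batch, tokens, hidden)
pooled = tf.reduce_mean(sequence_output, axis=-2)  # (1, 448)
logits = tf.keras.layers.Dense(1000)(pooled)       # (1, 1000), e.g. ImageNet-1k
print(logits.shape)
```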
@dataclass
class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
    """
    Output type of [`EfficientFormerForImageClassificationWithTeacher`].

    Args:
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the cls_logits and distillation logits.
        cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of
            the class token).
        distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when
            `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when
            `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    logits: tf.Tensor = None
    cls_logits: tf.Tensor = None
    distillation_logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@add_start_docstrings(
    """
    EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final
    hidden state and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::
            This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
            supported.
    """,
    EFFICIENTFORMER_START_DOCSTRING,
)
class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTrainedModel):
    def __init__(self, config: EfficientFormerConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")

        # Classifier heads
        self.classifier = (
            tf.keras.layers.Dense(config.num_labels, name="classifier")
            if config.num_labels > 0
            else tf.keras.layers.Activation("linear", name="classifier")
        )
        self.distillation_classifier = (
            tf.keras.layers.Dense(config.num_labels, name="distillation_classifier")
            if config.num_labels > 0
            else tf.keras.layers.Activation("linear", name="distillation_classifier")
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=TFEfficientFormerForImageClassificationWithTeacherOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def call(
        self,
        pixel_values: Optional[tf.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[tuple, TFEfficientFormerForImageClassificationWithTeacherOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if training:
            raise Exception(
                "This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet supported."
            )

        outputs = self.efficientformer(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = outputs[0]

        cls_logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))
        distillation_logits = self.distillation_classifier(tf.reduce_mean(sequence_output, axis=-2))
        logits = (cls_logits + distillation_logits) / 2

        if not return_dict:
            output = (logits, cls_logits, distillation_logits) + outputs[1:]
            return output

        return TFEfficientFormerForImageClassificationWithTeacherOutput(
            logits=logits,
            cls_logits=cls_logits,
            distillation_logits=distillation_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
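A hedged note on the teacher variant above: at inference the two heads are simply averaged, and no distillation loss exists yet, which is why `training=True` raises. Numerically:

```python
import tensorflow as tf

cls_logits = tf.constant([[2.0, 0.0]])
distillation_logits = tf.constant([[0.0, 2.0]])
logits = (cls_logits + distillation_logits) / 2  # exactly what call() returns
print(logits.numpy())  # [[1. 1.]]
```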
@@ -1099,6 +1099,37 @@ class TFDPRReader(metaclass=DummyObject):
        requires_backends(self, ["tf"])


TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None


class TFEfficientFormerForImageClassification(metaclass=DummyObject):
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])


class TFEfficientFormerForImageClassificationWithTeacher(metaclass=DummyObject):
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])


class TFEfficientFormerModel(metaclass=DummyObject):
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])


class TFEfficientFormerPreTrainedModel(metaclass=DummyObject):
    _backends = ["tf"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])


TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = None
@@ -18,6 +18,7 @@
import inspect
import unittest
import warnings
from typing import List

from transformers import EfficientFormerConfig
from transformers.models.auto import get_values
@@ -55,15 +56,16 @@ class EfficientFormerModelTester:
        self,
        parent,
        batch_size: int = 13,
-        image_size: int = 224,
+        image_size: int = 64,
        patch_size: int = 2,
-        embed_dim: int = 48,  # last embed dim of stem
+        embed_dim: int = 3,
        num_channels: int = 3,
        is_training: bool = True,
        use_labels: bool = True,
-        hidden_size: int = 448,
-        num_hidden_layers: int = 7,  # For the l1
-        num_attention_heads: int = 8,
+        hidden_size: int = 128,
+        hidden_sizes=[16, 32, 64, 128],
+        num_hidden_layers: int = 7,
+        num_attention_heads: int = 4,
        intermediate_size: int = 37,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
@@ -71,7 +73,11 @@ class EfficientFormerModelTester:
        type_sequence_label_size: int = 10,
        initializer_range: float = 0.02,
        encoder_stride: int = 2,
-        num_attention_outputs: int = 1,  # For l1
+        num_attention_outputs: int = 1,
+        dim: int = 128,
+        depths: List[int] = [2, 2, 2, 2],
+        resolution: int = 2,
+        mlp_expansion_ratio: int = 2,
    ):
        self.parent = parent
        self.batch_size = batch_size
@@ -93,6 +99,11 @@ class EfficientFormerModelTester:
        self.num_attention_outputs = num_attention_outputs
        self.embed_dim = embed_dim
        self.seq_length = embed_dim + 1
        self.resolution = resolution
        self.depths = depths
        self.hidden_sizes = hidden_sizes
        self.dim = dim
        self.mlp_expansion_ratio = mlp_expansion_ratio

    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
@@ -119,6 +130,11 @@ class EfficientFormerModelTester:
            is_decoder=False,
            initializer_range=self.initializer_range,
            encoder_stride=self.encoder_stride,
            resolution=self.resolution,
            depths=self.depths,
            hidden_sizes=self.hidden_sizes,
            dim=self.dim,
            mlp_expansion_ratio=self.mlp_expansion_ratio,
        )

    def create_and_check_model(self, config, pixel_values, labels):
@@ -379,6 +395,7 @@ class EfficientFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
        chunk_length = getattr(self.model_tester, "chunk_length", None)

        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes
393  tests/models/efficientformer/test_modeling_tf_efficientformer.py  Normal file
@ -0,0 +1,393 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the TensorFlow EfficientFormer model. """

import inspect
import unittest
from typing import List

import numpy as np

from transformers import EfficientFormerConfig
from transformers.testing_utils import require_tf, require_vision, slow
from transformers.utils import cached_property, is_tf_available, is_vision_available

from ...test_configuration_common import ConfigTester
from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin


if is_tf_available():
    import tensorflow as tf

    from transformers import (
        TFEfficientFormerForImageClassification,
        TFEfficientFormerForImageClassificationWithTeacher,
        TFEfficientFormerModel,
    )
    from transformers.models.efficientformer.modeling_tf_efficientformer import (
        TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
    )


if is_vision_available():
    from PIL import Image

    from transformers import EfficientFormerImageProcessor


class TFEfficientFormerModelTester:
    def __init__(
        self,
        parent,
        batch_size: int = 13,
        image_size: int = 64,
        patch_size: int = 2,
        embed_dim: int = 3,
        num_channels: int = 3,
        is_training: bool = True,
        use_labels: bool = True,
        hidden_size: int = 128,
        hidden_sizes=[16, 32, 64, 128],
        num_hidden_layers: int = 7,
        num_attention_heads: int = 4,
        intermediate_size: int = 37,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        type_sequence_label_size: int = 10,
        initializer_range: float = 0.02,
        encoder_stride: int = 2,
        num_attention_outputs: int = 1,
        dim: int = 128,
        depths: List[int] = [2, 2, 2, 2],
        resolution: int = 2,
        mlp_expansion_ratio: int = 2,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.encoder_stride = encoder_stride
        self.num_attention_outputs = num_attention_outputs
        self.embed_dim = embed_dim
        self.seq_length = embed_dim + 1
        self.resolution = resolution
        self.depths = depths
        self.hidden_sizes = hidden_sizes
        self.dim = dim
        self.mlp_expansion_ratio = mlp_expansion_ratio

    def prepare_config_and_inputs(self):
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)

        config = self.get_config()

        return config, pixel_values, labels

    def get_config(self):
        return EfficientFormerConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
            encoder_stride=self.encoder_stride,
            resolution=self.resolution,
            depths=self.depths,
            hidden_sizes=self.hidden_sizes,
            dim=self.dim,
            mlp_expansion_ratio=self.mlp_expansion_ratio,
        )

    def create_and_check_model(self, config, pixel_values, labels):
        model = TFEfficientFormerModel(config=config)
        result = model(pixel_values, training=False)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))

    def create_and_check_for_image_classification(self, config, pixel_values, labels):
        config.num_labels = self.type_sequence_label_size
        model = TFEfficientFormerForImageClassification(config)
        result = model(pixel_values, labels=labels, training=False)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))

        # test greyscale images
        config.num_channels = 1
        model = TFEfficientFormerForImageClassification(config)

        pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
        result = model(pixel_values, labels=labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))

    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, labels = config_and_inputs
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict


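As a side note on the shrunken defaults above (64-pixel images, four stages of depth 2): they exist so the common tests stay fast, per the "Make tests smaller" step in the commit message. A hedged sanity sketch of the tiny model they produce (config kwargs mirror the tester defaults; the expected output shape follows from EfficientFormer's 4x stem plus three 2x downsampling stages):

```python
import tensorflow as tf

from transformers import EfficientFormerConfig, TFEfficientFormerModel

# Tiny configuration mirroring TFEfficientFormerModelTester's defaults.
config = EfficientFormerConfig(
    image_size=64,
    patch_size=2,
    num_channels=3,
    hidden_sizes=[16, 32, 64, 128],
    hidden_size=128,
    depths=[2, 2, 2, 2],
    resolution=2,
    dim=128,
    mlp_expansion_ratio=2,
)
model = TFEfficientFormerModel(config)

# Channels-first input, matching the tester's floats_tensor call.
pixel_values = tf.random.uniform((1, 3, 64, 64))
out = model(pixel_values, training=False)
print(out.last_hidden_state.shape)  # expected (1, 4, 128): a 2x2 final map flattened to 4 tokens
```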
@require_tf
class TFEfficientFormerModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_tf_common.py, as EfficientFormer does not use input_ids,
    inputs_embeds, attention_mask and seq_length.
    """

    all_model_classes = (
        (
            TFEfficientFormerModel,
            TFEfficientFormerForImageClassificationWithTeacher,
            TFEfficientFormerForImageClassification,
        )
        if is_tf_available()
        else ()
    )
    pipeline_model_mapping = (
        {
            "feature-extraction": TFEfficientFormerModel,
            "image-classification": (
                TFEfficientFormerForImageClassification,
                TFEfficientFormerForImageClassificationWithTeacher,
            ),
        }
        if is_tf_available()
        else {}
    )

    fx_compatible = False

    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    test_onnx = False

    def setUp(self):
        self.model_tester = TFEfficientFormerModelTester(self)
        self.config_tester = ConfigTester(
            self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="EfficientFormer does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    @unittest.skip(reason="EfficientFormer does not support input and output embeddings")
    def test_model_common_attributes(self):
        pass

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.call)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)

            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
            )
            self.assertEqual(len(hidden_states), expected_num_layers)

            if hasattr(self.model_tester, "encoder_seq_length"):
                seq_length = self.model_tester.encoder_seq_length
                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
                    seq_length = seq_length * self.model_tester.chunk_length
            else:
                seq_length = self.model_tester.seq_length

            self.assertListEqual(
                list(hidden_states[-1].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

            if config.is_encoder_decoder:
                hidden_states = outputs.decoder_hidden_states

                self.assertIsInstance(hidden_states, (list, tuple))
                self.assertEqual(len(hidden_states), expected_num_layers)
                seq_len = getattr(self.model_tester, "seq_length", None)
                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)

                self.assertListEqual(
                    list(hidden_states[-1].shape[-2:]),
                    [decoder_seq_length, self.model_tester.hidden_size],
                )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)

        if return_labels:
            if model_class.__name__ == "TFEfficientFormerForImageClassificationWithTeacher":
                del inputs_dict["labels"]

        return inputs_dict

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet")
    def test_for_masked_image_modeling(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs)

    def test_for_image_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFEfficientFormerModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        seq_len = getattr(self.model_tester, "seq_length", None)
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
        chunk_length = getattr(self.model_tester, "chunk_length", None)

        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)

            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)

            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_attention_outputs)

            if chunk_length is not None:
                self.assertListEqual(
                    list(attentions[0].shape[-4:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
                )
            else:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )


|
||||
def prepare_img():
|
||||
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
return image
|
||||
|
||||
|
||||
@require_tf
|
||||
@require_vision
|
||||
class EfficientFormerModelIntegrationTest(unittest.TestCase):
|
||||
@cached_property
|
||||
def default_image_processor(self):
|
||||
return (
|
||||
EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300")
|
||||
if is_vision_available()
|
||||
else None
|
||||
)
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head(self):
|
||||
model = TFEfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300")
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
# forward pass
|
||||
outputs = model(**inputs, training=False)
|
||||
# verify the logits
|
||||
expected_shape = tf.TensorShape((1, 1000))
|
||||
self.assertEqual(outputs.logits.shape, expected_shape)
|
||||
expected_slice = tf.constant([-0.0555, 0.4825, -0.0852])
|
||||
self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
|
||||
|
||||
@slow
|
||||
def test_inference_image_classification_head_with_teacher(self):
|
||||
model = TFEfficientFormerForImageClassificationWithTeacher.from_pretrained(
|
||||
"snap-research/efficientformer-l1-300"
|
||||
)
|
||||
image_processor = self.default_image_processor
|
||||
image = prepare_img()
|
||||
inputs = image_processor(images=image, return_tensors="tf")
|
||||
# forward pass
|
||||
outputs = model(**inputs, training=False)
|
||||
# verify the logits
|
||||
expected_shape = tf.TensorShape((1, 1000))
|
||||
self.assertEqual(outputs.logits.shape, expected_shape)
|
||||
expected_slice = tf.constant([-0.1312, 0.4353, -1.0499])
|
||||
self.assertTrue(np.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
|
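One behavior the common tests above pin down, stated here for convenience: hidden states come back as `num_hidden_layers + 1` tensors (one per layer plus the embedding output) and attentions as `num_attention_outputs` tensors. A short sketch of requesting both at inference time, reusing `model` and `inputs` from the with-teacher integration test (illustrative only, not part of the diff):

```python
outputs = model(**inputs, output_hidden_states=True, output_attentions=True, training=False)

# One hidden state per layer plus the embedding output (num_hidden_layers + 1).
print(len(outputs.hidden_states))

# The L1 tester expects a single attention output (num_attention_outputs = 1).
print(len(outputs.attentions), outputs.attentions[0].shape)
```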
@ -82,6 +82,7 @@ src/transformers/models/dpt/modeling_dpt.py
src/transformers/models/electra/configuration_electra.py
src/transformers/models/electra/modeling_electra.py
src/transformers/models/electra/modeling_tf_electra.py
src/transformers/models/efficientformer/modeling_tf_efficientformer.py
src/transformers/models/ernie/configuration_ernie.py
src/transformers/models/ernie_m/configuration_ernie_m.py
src/transformers/models/ernie_m/modeling_ernie_m.py