Duc-Viet Hoang 2025-05-19 21:42:31 +07:00
parent 96761e9366
commit ef738f3902
8 changed files with 202 additions and 261 deletions

View File

@@ -43,7 +43,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("bamba", "BambaConfig"),
("bark", "BarkConfig"),
("bart", "BartConfig"),
("florence2", "Florence2Config"),
("beit", "BeitConfig"),
("bert", "BertConfig"),
("bert-generation", "BertGenerationConfig"),
@@ -123,6 +122,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("fastspeech2_conformer", "FastSpeech2ConformerConfig"),
("flaubert", "FlaubertConfig"),
("flava", "FlavaConfig"),
("florence2", "Florence2Config"),
("fnet", "FNetConfig"),
("focalnet", "FocalNetConfig"),
("fsmt", "FSMTConfig"),
@@ -395,7 +395,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("bamba", "Bamba"),
("bark", "Bark"),
("bart", "BART"),
("florence2", "Florence2"),
("barthez", "BARThez"),
("bartpho", "BARTpho"),
("beit", "BEiT"),
@@ -489,6 +488,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("flan-ul2", "FLAN-UL2"),
("flaubert", "FlauBERT"),
("flava", "FLAVA"),
("florence2", "Florence2"),
("fnet", "FNet"),
("focalnet", "FocalNet"),
("fsmt", "FairSeq Machine-Translation"),

View File

@@ -43,7 +43,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("bamba", "BambaModel"),
("bark", "BarkModel"),
("bart", "BartModel"),
("florence2", "Florence2Model"),
("beit", "BeitModel"),
("bert", "BertModel"),
("bert-generation", "BertGenerationEncoder"),
@@ -120,6 +119,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("fastspeech2_conformer", "FastSpeech2ConformerModel"),
("flaubert", "FlaubertModel"),
("flava", "FlavaModel"),
("florence2", "Florence2Model"),
("fnet", "FNetModel"),
("focalnet", "FocalNetModel"),
("fsmt", "FSMTModel"),
@@ -360,7 +360,6 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
# Model for pre-training mapping
("albert", "AlbertForPreTraining"),
("bart", "BartForConditionalGeneration"),
("florence2", "Florence2ForConditionalGeneration"),
("bert", "BertForPreTraining"),
("big_bird", "BigBirdForPreTraining"),
("bloom", "BloomForCausalLM"),
@@ -376,6 +375,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("falcon_mamba", "FalconMambaForCausalLM"),
("flaubert", "FlaubertWithLMHeadModel"),
("flava", "FlavaForPreTraining"),
("florence2", "Florence2ForConditionalGeneration"),
("fnet", "FNetForPreTraining"),
("fsmt", "FSMTForConditionalGeneration"),
("funnel", "FunnelForPreTraining"),
@@ -448,7 +448,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
# Model with LM heads mapping
("albert", "AlbertForMaskedLM"),
("bart", "BartForConditionalGeneration"),
("florence2", "Florence2ForConditionalGeneration"),
("bert", "BertForMaskedLM"),
("big_bird", "BigBirdForMaskedLM"),
("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
@@ -469,6 +468,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("esm", "EsmForMaskedLM"),
("falcon_mamba", "FalconMambaForCausalLM"),
("flaubert", "FlaubertWithLMHeadModel"),
("florence2", "Florence2ForConditionalGeneration"),
("fnet", "FNetForMaskedLM"),
("fsmt", "FSMTForConditionalGeneration"),
("funnel", "FunnelForMaskedLM"),
@@ -537,7 +537,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("aria_text", "AriaTextForCausalLM"),
("bamba", "BambaForCausalLM"),
("bart", "BartForCausalLM"),
("florence2", "Florence2ForCausalLM"),
("bert", "BertLMHeadModel"),
("bert-generation", "BertGenerationDecoder"),
("big_bird", "BigBirdForCausalLM"),
@@ -563,6 +562,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("ernie", "ErnieForCausalLM"),
("falcon", "FalconForCausalLM"),
("falcon_mamba", "FalconMambaForCausalLM"),
("florence2", "Florence2ForCausalLM"),
("fuyu", "FuyuForCausalLM"),
("gemma", "GemmaForCausalLM"),
("gemma2", "Gemma2ForCausalLM"),
@@ -922,7 +922,6 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
# Model for Masked LM mapping
("albert", "AlbertForMaskedLM"),
("bart", "BartForConditionalGeneration"),
("florence2", "Florence2ForConditionalGeneration"),
("bert", "BertForMaskedLM"),
("big_bird", "BigBirdForMaskedLM"),
("camembert", "CamembertForMaskedLM"),
@@ -935,6 +934,7 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
("ernie", "ErnieForMaskedLM"),
("esm", "EsmForMaskedLM"),
("flaubert", "FlaubertWithLMHeadModel"),
("florence2", "Florence2ForConditionalGeneration"),
("fnet", "FNetForMaskedLM"),
("funnel", "FunnelForMaskedLM"),
("ibert", "IBertForMaskedLM"),
@@ -1011,11 +1011,11 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
[
# Model for Seq2Seq Causal LM mapping
("bart", "BartForConditionalGeneration"),
("florence2", "Florence2ForConditionalGeneration"),
("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
("blenderbot", "BlenderbotForConditionalGeneration"),
("blenderbot-small", "BlenderbotSmallForConditionalGeneration"),
("encoder-decoder", "EncoderDecoderModel"),
("florence2", "Florence2ForConditionalGeneration"),
("fsmt", "FSMTForConditionalGeneration"),
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
("granite_speech", "GraniteSpeechForConditionalGeneration"),
@@ -1060,7 +1060,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
# Model for Sequence Classification mapping
("albert", "AlbertForSequenceClassification"),
("bart", "BartForSequenceClassification"),
("florence2", "Florence2ForSequenceClassification"),
("bert", "BertForSequenceClassification"),
("big_bird", "BigBirdForSequenceClassification"),
("bigbird_pegasus", "BigBirdPegasusForSequenceClassification"),
@@ -1082,6 +1081,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("esm", "EsmForSequenceClassification"),
("falcon", "FalconForSequenceClassification"),
("flaubert", "FlaubertForSequenceClassification"),
("florence2", "Florence2ForSequenceClassification"),
("fnet", "FNetForSequenceClassification"),
("funnel", "FunnelForSequenceClassification"),
("gemma", "GemmaForSequenceClassification"),
@@ -1165,7 +1165,6 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
# Model for Question Answering mapping
("albert", "AlbertForQuestionAnswering"),
("bart", "BartForQuestionAnswering"),
("florence2", "Florence2ForQuestionAnswering"),
("bert", "BertForQuestionAnswering"),
("big_bird", "BigBirdForQuestionAnswering"),
("bigbird_pegasus", "BigBirdPegasusForQuestionAnswering"),
@@ -1183,6 +1182,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
("ernie_m", "ErnieMForQuestionAnswering"),
("falcon", "FalconForQuestionAnswering"),
("flaubert", "FlaubertForQuestionAnsweringSimple"),
("florence2", "Florence2ForQuestionAnswering"),
("fnet", "FNetForQuestionAnswering"),
("funnel", "FunnelForQuestionAnswering"),
("gpt2", "GPT2ForQuestionAnswering"),

View File

@@ -72,7 +72,6 @@ else:
("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
("bart", ("BartTokenizer", "BartTokenizerFast")),
("florence2", ("BartTokenizer", "BartTokenizerFast")),
(
"barthez",
(
@@ -207,6 +206,7 @@ else:
("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None),
),
("flaubert", ("FlaubertTokenizer", None)),
("florence2", ("BartTokenizer", "BartTokenizerFast")),
("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)),
("fsmt", ("FSMTTokenizer", None)),
("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),

View File

@@ -20,8 +20,6 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_florence2 import *
from .modeling_florence2 import *
from .modeling_flax_florence2 import *
from .modeling_tf_florence2 import *
else:
import sys

View File

@@ -1,6 +1,5 @@
# coding=utf-8
# Copyright 2025 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Copyright 2025 Microsoft and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -15,34 +14,125 @@
"""FLORENCE2 model configuration"""
import warnings
from collections import OrderedDict
from typing import Any, Mapping, Optional
from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import TensorType, is_torch_available, logging
from ...utils import logging
logger = logging.get_logger(__name__)
class Florence2Config(PretrainedConfig):
class Florence2VisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Florence2Model`]. It is used to instantiate a FLORENCE2
This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
drop_path_rate (`float`, *optional*, defaults to 0.1):
The dropout rate of the drop path layer.
patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
The patch size of the image.
patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
The patch stride of the image.
patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
The patch padding of the image.
patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
Whether to apply layer normalization before the patch embedding layer.
enable_checkpoint (`bool`, *optional*, defaults to False):
Whether to enable checkpointing.
dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
The dimension of the embedding layer.
num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
The number of attention heads.
num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
The number of groups.
depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
The depth of the model.
window_size (`int`, *optional*, defaults to 12):
The window size of the model.
projection_dim (`int`, *optional*, defaults to 1024):
The dimension of the projection layer.
visual_temporal_embedding (`dict`, *optional*):
The configuration of the visual temporal embedding.
image_pos_embed (`dict`, *optional*):
The configuration of the image position embedding.
image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
The source of the image feature.
Example:
```python
>>> from transformers import Florence2VisionConfig, Florence2VisionModel
>>> # Initializing a Florence2 Vision style configuration
>>> configuration = Florence2VisionConfig()
>>> # Initializing a model (with random weights)
>>> model = Florence2VisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "florence2_vision"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
drop_path_rate=0.1,
patch_size=[7, 3, 3, 3],
patch_stride=[4, 2, 2, 2],
patch_padding=[3, 1, 1, 1],
patch_prenorm=[False, True, True, True],
enable_checkpoint=False,
dim_embed=[256, 512, 1024, 2048],
num_heads=[8, 16, 32, 64],
num_groups=[8, 16, 32, 64],
depths=[1, 1, 9, 1],
window_size=12,
projection_dim=1024,
visual_temporal_embedding=None,
image_pos_embed=None,
image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
**kwargs,
):
self.drop_path_rate = drop_path_rate
self.patch_size = patch_size
self.patch_stride = patch_stride
self.patch_padding = patch_padding
self.patch_prenorm = patch_prenorm
self.enable_checkpoint = enable_checkpoint
self.dim_embed = dim_embed
self.num_heads = num_heads
self.num_groups = num_groups
self.depths = depths
self.window_size = window_size
self.projection_dim = projection_dim
self.visual_temporal_embedding = visual_temporal_embedding
self.image_pos_embed = image_pos_embed
self.image_feature_source = image_feature_source
super().__init__(**kwargs)
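The vision hyperparameters above are parallel per-stage lists. A small sketch that makes the four-stage layout explicit, assuming this branch is installed:

```python
from transformers.models.florence2.configuration_florence2 import Florence2VisionConfig

config = Florence2VisionConfig()  # defaults shown in the class above
# depths, dim_embed, num_heads and num_groups each carry one value per backbone stage.
for i, (depth, dim, heads, groups) in enumerate(
    zip(config.depths, config.dim_embed, config.num_heads, config.num_groups)
):
    print(f"stage {i}: depth={depth}, dim_embed={dim}, num_heads={heads}, num_groups={groups}")
```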
class Florence2LanguageConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the FLORENCE2
[facebook/florence2-large](https://huggingface.co/facebook/florence2-large) architecture.
defaults will yield a similar configuration to that of the BART
[facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the FLORENCE2 model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`Florence2Model`] or [`TFFlorence2Model`].
vocab_size (`int`, *optional*, defaults to 51289):
Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`Florence2LanguageModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 12):
@@ -84,7 +174,7 @@ class Florence2Config(PretrainedConfig):
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
num_labels (`int`, *optional*, defaults to 3):
The number of labels to use in [`Florence2ForSequenceClassification`].
The number of labels to use in [`Florence2LanguageForSequenceClassification`].
forced_eos_token_id (`int`, *optional*, defaults to 2):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.
@@ -92,25 +182,25 @@ class Florence2Config(PretrainedConfig):
Example:
```python
>>> from transformers import Florence2Config, Florence2Model
>>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
>>> # Initializing a FLORENCE2 facebook/florence2-large style configuration
>>> configuration = Florence2Config()
>>> # Initializing a Florence2 Language style configuration
>>> configuration = Florence2LanguageConfig()
>>> # Initializing a model (with random weights) from the facebook/florence2-large style configuration
>>> model = Florence2Model(configuration)
>>> # Initializing a model (with random weights)
>>> model = Florence2LanguageModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "florence2"
model_type = "florence2_language"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=50265,
vocab_size=51289,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
@@ -170,7 +260,7 @@ class Florence2Config(PretrainedConfig):
**kwargs,
)
# ensure backward compatibility for FLORENCE2 CNN models
# ensure backward compatibility for BART CNN models
if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
self.forced_bos_token_id = self.bos_token_id
warnings.warn(
@@ -179,226 +269,70 @@ class Florence2Config(PretrainedConfig):
)
class Florence2OnnxConfig(OnnxSeq2SeqConfigWithPast):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
class Florence2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
Florence-2 model according to the specified arguments, defining the model architecture.
if self.use_past:
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
elif self.task == "causal-lm":
# TODO: figure this case out.
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
else:
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
]
)
Args:
vision_config (`Florence2VisionConfig`, *optional*):
Custom vision config or dict
text_config (`Union[AutoConfig, dict]`, *optional*):
The config object of the text backbone.
ignore_index (`int`, *optional*, defaults to -100):
The ignore index for the loss function.
vocab_size (`int`, *optional*, defaults to 51289):
Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`~Florence2ForConditionalGeneration`].
projection_dim (`int`, *optional*, defaults to 1024):
Dimension of the multimodal projection space.
return common_inputs
Example:
@property
def outputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task in ["default", "seq2seq-lm"]:
common_outputs = super().outputs
else:
common_outputs = super(OnnxConfigWithPast, self).outputs
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
return common_outputs
```python
>>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
>>> # Initializing a clip-like vision config
>>> vision_config = CLIPVisionConfig()
>>> # Initializing a Bart config
>>> text_config = BartConfig()
>>> # Initializing a Florence-2 configuration
>>> configuration = Florence2Config(vision_config, text_config)
>>> # Initializing a model from the florence-2 configuration
>>> model = Florence2ForConditionalGeneration(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "florence2"
is_composition = False
def __init__(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
vision_config=None,
text_config=None,
ignore_index=-100,
vocab_size=51289,
projection_dim=1024,
**kwargs,
):
self.ignore_index = ignore_index
self.vocab_size = vocab_size
self.projection_dim = projection_dim
if vision_config is not None:
vision_config = PretrainedConfig(**vision_config)
self.vision_config = vision_config
self.vocab_size = self.vocab_size
# Generate decoder inputs
decoder_seq_length = seq_length if not self.use_past else 1
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, decoder_seq_length, is_pair, framework
)
decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
common_inputs = dict(**encoder_inputs, **decoder_inputs)
self.text_config = text_config
if text_config is not None:
self.text_config = Florence2LanguageConfig(**text_config)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, encoder_seq_length = common_inputs["input_ids"].shape
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
encoder_shape = (
batch,
num_encoder_attention_heads,
encoder_seq_length,
self._config.hidden_size // num_encoder_attention_heads,
)
decoder_past_length = decoder_seq_length + 3
decoder_shape = (
batch,
num_decoder_attention_heads,
decoder_past_length,
self._config.hidden_size // num_decoder_attention_heads,
)
common_inputs["decoder_attention_mask"] = torch.cat(
[common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
)
common_inputs["past_key_values"] = []
# If the number of encoder and decoder layers are present in the model configuration, both are considered
num_encoder_layers, num_decoder_layers = self.num_layers
min_num_layers = min(num_encoder_layers, num_decoder_layers)
max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
for _ in range(min_num_layers):
common_inputs["past_key_values"].append(
(
torch.zeros(decoder_shape),
torch.zeros(decoder_shape),
torch.zeros(encoder_shape),
torch.zeros(encoder_shape),
)
)
# TODO: test this.
shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
for _ in range(min_num_layers, max_num_layers):
common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
return common_inputs
def _generate_dummy_inputs_for_causal_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
if self.use_past:
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
batch, seqlen = common_inputs["input_ids"].shape
# Not using the same length for past_key_values
past_key_values_length = seqlen + 2
num_encoder_layers, _ = self.num_layers
num_encoder_attention_heads, _ = self.num_attention_heads
past_shape = (
batch,
num_encoder_attention_heads,
past_key_values_length,
self._config.hidden_size // num_encoder_attention_heads,
)
mask_dtype = common_inputs["attention_mask"].dtype
common_inputs["attention_mask"] = torch.cat(
[common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
common_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
]
return common_inputs
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
# Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
# If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
# If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
# Generate dummy inputs according to compute batch and sequence
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
return common_inputs
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
if self.task in ["default", "seq2seq-lm"]:
common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
elif self.task == "causal-lm":
common_inputs = self._generate_dummy_inputs_for_causal_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
else:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
return common_inputs
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
if self.task in ["default", "seq2seq-lm"]:
flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
else:
flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
flattened_output, name, idx, t
)
__all__ = ["Florence2Config", "Florence2OnnxConfig"]
super().__init__(**kwargs)
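Sub-configs can be supplied as plain dicts; `text_config` is re-wrapped in `Florence2LanguageConfig`, while in this revision `vision_config` is stored as a generic `PretrainedConfig`. A minimal sketch under those assumptions:

```python
from transformers.models.florence2.configuration_florence2 import Florence2Config

config = Florence2Config(
    vision_config={"dim_embed": [256, 512, 1024, 2048], "depths": [1, 1, 9, 1]},
    text_config={"vocab_size": 51289, "d_model": 1024},
    vocab_size=51289,
)
print(type(config.text_config).__name__)  # Florence2LanguageConfig
print(config.vision_config.dim_embed)     # [256, 512, 1024, 2048]
```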

View File

@ -24,11 +24,11 @@ from packaging import version
from torch import nn
from transformers import (
BartTokenizer,
Florence2Config,
Florence2ForConditionalGeneration,
Florence2ForSequenceClassification,
Florence2Model,
BartTokenizer,
)
from transformers.utils import logging
@@ -146,7 +146,9 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"fairseq_path", type=str, help="florence2.large, florence2.large.cnn or a path to a model.pt on local filesystem."
"fairseq_path",
type=str,
help="florence2.large, florence2.large.cnn or a path to a model.pt on local filesystem.",
)
parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
parser.add_argument(

View File

@@ -75,7 +75,7 @@ def _trunc_normal_(tensor, mean, std, a, b):
def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
# type: (Tensor, float, float, float, float) -> Tensor
# type: (torch.Tensor, float, float, float, float) -> torch.Tensor
r"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
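Illustrative usage of `trunc_normal_` as defined above, e.g. for filling a weight tensor in place; the import path is an assumption about where the helper lives in this branch:

```python
import torch

# Assumed location of the helper shown in the hunk above.
from transformers.models.florence2.modeling_florence2 import trunc_normal_

# Values come from N(mean, std^2), truncated to the default interval [a, b] = [-2.0, 2.0].
weight = torch.empty(1024, 1024)
trunc_normal_(weight, mean=0.0, std=0.02)
print(float(weight.std()))  # approximately 0.02
```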

View File

@@ -41,12 +41,12 @@ if is_torch_available():
from transformers import (
AutoModelForSequenceClassification,
BartTokenizer,
Florence2ForCausalLM,
Florence2ForConditionalGeneration,
Florence2ForQuestionAnswering,
Florence2ForSequenceClassification,
Florence2Model,
BartTokenizer,
pipeline,
)
from transformers.models.florence2.modeling_florence2 import Florence2Decoder, Florence2Encoder, shift_tokens_right
@@ -401,7 +401,12 @@ class Florence2HeadTests(unittest.TestCase):
@require_torch
class Florence2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(Florence2Model, Florence2ForConditionalGeneration, Florence2ForSequenceClassification, Florence2ForQuestionAnswering)
(
Florence2Model,
Florence2ForConditionalGeneration,
Florence2ForSequenceClassification,
Florence2ForQuestionAnswering,
)
if is_torch_available()
else ()
)
@@ -1218,7 +1223,9 @@ class Florence2ModelIntegrationTests(unittest.TestCase):
" up to four years in prison. Her next court appearance is scheduled for May 18."
)
florence2_tokenizer = BartTokenizer.from_pretrained("facebook/florence2-large-cnn")
florence2_model = Florence2ForConditionalGeneration.from_pretrained("facebook/florence2-large-cnn").to(torch_device)
florence2_model = Florence2ForConditionalGeneration.from_pretrained("facebook/florence2-large-cnn").to(
torch_device
)
input_ids = florence2_tokenizer(
article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="pt"
).input_ids.to(torch_device)
@@ -1238,9 +1245,9 @@ class Florence2ModelIntegrationTests(unittest.TestCase):
@slow
def test_decoder_attention_mask(self):
model = Florence2ForConditionalGeneration.from_pretrained("facebook/florence2-large", forced_bos_token_id=0).to(
torch_device
)
model = Florence2ForConditionalGeneration.from_pretrained(
"facebook/florence2-large", forced_bos_token_id=0
).to(torch_device)
tokenizer = self.default_tokenizer
sentence = "UN Chief Says There Is No <mask> in Syria"
input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(torch_device)