commit 672d7a0f3f (parent f834d368f6)

    init
docs/source/en/_toctree.yml

@@ -463,6 +463,8 @@
      title: FLAN-UL2
  - local: model_doc/flaubert
    title: FlauBERT
  - local: model_doc/florence2
    title: Florence2
  - local: model_doc/fnet
    title: FNet
  - local: model_doc/fsmt
docs/source/en/model_doc/florence2.md (new file, 67 lines)
@@ -0,0 +1,67 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Florence2

## Overview

The Florence2 model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>

The abstract from the paper is the following:

*<INSERT PAPER ABSTRACT HERE>*

Tips:

<INSERT TIPS ABOUT MODEL HERE>

This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).

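Since this commit registers `florence2` with the BART tokenizer classes and adds `Florence2ForConditionalGeneration`, a minimal generation sketch could look like the following (assuming this branch is installed; the model is built from the default config with random weights because no pretrained Florence2 checkpoint is pinned down in this diff, and the `facebook/bart-large` tokenizer stands in for a real Florence2 tokenizer):

```python
from transformers import BartTokenizer, Florence2Config, Florence2ForConditionalGeneration

# Randomly initialized model using the defaults defined in configuration_florence2.py.
model = Florence2ForConditionalGeneration(Florence2Config()).eval()

# The auto-tokenizer mapping in this commit points "florence2" at the BART tokenizer.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

inputs = tokenizer("Hello world!", return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
```
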
## Florence2Config

[[autodoc]] Florence2Config
    - all

## Florence2Model

[[autodoc]] Florence2Model
    - forward

## Florence2ForConditionalGeneration

[[autodoc]] Florence2ForConditionalGeneration
    - forward

## Florence2ForSequenceClassification

[[autodoc]] Florence2ForSequenceClassification
    - forward

## Florence2ForQuestionAnswering

[[autodoc]] Florence2ForQuestionAnswering
    - forward

## Florence2ForCausalLM

[[autodoc]] Florence2ForCausalLM
    - forward
src/transformers/models/auto/configuration_auto.py

@@ -43,6 +43,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("bamba", "BambaConfig"),
        ("bark", "BarkConfig"),
        ("bart", "BartConfig"),
        ("florence2", "Florence2Config"),
        ("beit", "BeitConfig"),
        ("bert", "BertConfig"),
        ("bert-generation", "BertGenerationConfig"),
@@ -394,6 +395,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("bamba", "Bamba"),
        ("bark", "Bark"),
        ("bart", "BART"),
        ("florence2", "Florence2"),
        ("barthez", "BARThez"),
        ("bartpho", "BARTpho"),
        ("beit", "BEiT"),

src/transformers/models/auto/modeling_auto.py

@@ -43,6 +43,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("bamba", "BambaModel"),
        ("bark", "BarkModel"),
        ("bart", "BartModel"),
        ("florence2", "Florence2Model"),
        ("beit", "BeitModel"),
        ("bert", "BertModel"),
        ("bert-generation", "BertGenerationEncoder"),
@@ -359,6 +360,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
        # Model for pre-training mapping
        ("albert", "AlbertForPreTraining"),
        ("bart", "BartForConditionalGeneration"),
        ("florence2", "Florence2ForConditionalGeneration"),
        ("bert", "BertForPreTraining"),
        ("big_bird", "BigBirdForPreTraining"),
        ("bloom", "BloomForCausalLM"),
@@ -446,6 +448,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
        # Model with LM heads mapping
        ("albert", "AlbertForMaskedLM"),
        ("bart", "BartForConditionalGeneration"),
        ("florence2", "Florence2ForConditionalGeneration"),
        ("bert", "BertForMaskedLM"),
        ("big_bird", "BigBirdForMaskedLM"),
        ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
@@ -534,6 +537,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
        ("aria_text", "AriaTextForCausalLM"),
        ("bamba", "BambaForCausalLM"),
        ("bart", "BartForCausalLM"),
        ("florence2", "Florence2ForCausalLM"),
        ("bert", "BertLMHeadModel"),
        ("bert-generation", "BertGenerationDecoder"),
        ("big_bird", "BigBirdForCausalLM"),
@@ -918,6 +922,7 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
        # Model for Masked LM mapping
        ("albert", "AlbertForMaskedLM"),
        ("bart", "BartForConditionalGeneration"),
        ("florence2", "Florence2ForConditionalGeneration"),
        ("bert", "BertForMaskedLM"),
        ("big_bird", "BigBirdForMaskedLM"),
        ("camembert", "CamembertForMaskedLM"),
@@ -1006,6 +1011,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
    [
        # Model for Seq2Seq Causal LM mapping
        ("bart", "BartForConditionalGeneration"),
        ("florence2", "Florence2ForConditionalGeneration"),
        ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
        ("blenderbot", "BlenderbotForConditionalGeneration"),
        ("blenderbot-small", "BlenderbotSmallForConditionalGeneration"),
@@ -1054,6 +1060,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        # Model for Sequence Classification mapping
        ("albert", "AlbertForSequenceClassification"),
        ("bart", "BartForSequenceClassification"),
        ("florence2", "Florence2ForSequenceClassification"),
        ("bert", "BertForSequenceClassification"),
        ("big_bird", "BigBirdForSequenceClassification"),
        ("bigbird_pegasus", "BigBirdPegasusForSequenceClassification"),
@@ -1158,6 +1165,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
        # Model for Question Answering mapping
        ("albert", "AlbertForQuestionAnswering"),
        ("bart", "BartForQuestionAnswering"),
        ("florence2", "Florence2ForQuestionAnswering"),
        ("bert", "BertForQuestionAnswering"),
        ("big_bird", "BigBirdForQuestionAnswering"),
        ("bigbird_pegasus", "BigBirdPegasusForQuestionAnswering"),

src/transformers/models/auto/tokenization_auto.py

@@ -72,6 +72,7 @@ else:
            ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
            ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("florence2", ("BartTokenizer", "BartTokenizerFast")),
            (
                "barthez",
                (
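The mapping entries above are what let the Auto classes resolve the `florence2` model type; a minimal sketch of what that enables (assuming this branch is installed; all weights are randomly initialized since no checkpoint ships with this commit):

```python
from transformers import AutoConfig, AutoModel, AutoModelForSeq2SeqLM

config = AutoConfig.for_model("florence2")               # -> Florence2Config via CONFIG_MAPPING_NAMES
encoder_decoder = AutoModel.from_config(config)          # -> Florence2Model via MODEL_MAPPING_NAMES
seq2seq_lm = AutoModelForSeq2SeqLM.from_config(config)   # -> Florence2ForConditionalGeneration
```
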
src/transformers/models/florence2/__init__.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    # Only the configuration and PyTorch modeling files are added in this commit,
    # so only those are re-exported here.
    from .configuration_florence2 import *
    from .modeling_florence2 import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
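`_LazyModule` defers the heavy imports until an attribute is first accessed; a small sketch of the effect (assuming this branch is installed):

```python
import transformers.models.florence2 as florence2  # cheap: only the lazy shim is loaded

# Attribute access triggers the real import of configuration_florence2 behind the scenes.
config_cls = florence2.Florence2Config
print(config_cls().model_type)  # "florence2"
```
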
src/transformers/models/florence2/configuration_florence2.py (new file, 404 lines)
@@ -0,0 +1,404 @@
# coding=utf-8
# Copyright 2025 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FLORENCE2 model configuration"""

import warnings
from collections import OrderedDict
from typing import Any, Mapping, Optional

from ... import PreTrainedTokenizer
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
from ...onnx.utils import compute_effective_axis_dimension
from ...utils import TensorType, is_torch_available, logging


logger = logging.get_logger(__name__)


class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2Model`]. It is used to instantiate a FLORENCE2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the FLORENCE2
    [facebook/florence2-large](https://huggingface.co/facebook/florence2-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the FLORENCE2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Florence2Model`] or [`TFFlorence2Model`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`Florence2ForSequenceClassification`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> from transformers import Florence2Config, Florence2Model

    >>> # Initializing a FLORENCE2 facebook/florence2-large style configuration
    >>> configuration = Florence2Config()

    >>> # Initializing a model (with random weights) from the facebook/florence2-large style configuration
    >>> model = Florence2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=50265,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )

        # ensure backward compatibility for FLORENCE2 CNN models
        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed."
            )

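The `attribute_map` above makes the BART-style names and the generic `PretrainedConfig` names interchangeable; a quick sketch of the aliasing with the defaults defined here (assuming this branch is installed):

```python
from transformers import Florence2Config

cfg = Florence2Config()
assert cfg.hidden_size == cfg.d_model == 1024                         # "hidden_size" aliases "d_model"
assert cfg.num_attention_heads == cfg.encoder_attention_heads == 16   # aliases "encoder_attention_heads"
assert cfg.model_type == "florence2"
```
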
class Florence2OnnxConfig(OnnxSeq2SeqConfigWithPast):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        if self.task in ["default", "seq2seq-lm"]:
            common_inputs = OrderedDict(
                [
                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
                ]
            )

            if self.use_past:
                common_inputs["decoder_input_ids"] = {0: "batch"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
            else:
                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

            if self.use_past:
                self.fill_with_past_key_values_(common_inputs, direction="inputs")
        elif self.task == "causal-lm":
            # TODO: figure this case out.
            common_inputs = OrderedDict(
                [
                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
                ]
            )
            if self.use_past:
                num_encoder_layers, _ = self.num_layers
                for i in range(num_encoder_layers):
                    common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
                    common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
        else:
            common_inputs = OrderedDict(
                [
                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
                    ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
                    ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
                ]
            )

        return common_inputs

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        if self.task in ["default", "seq2seq-lm"]:
            common_outputs = super().outputs
        else:
            common_outputs = super(OnnxConfigWithPast, self).outputs
            if self.use_past:
                num_encoder_layers, _ = self.num_layers
                for i in range(num_encoder_layers):
                    common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
                    common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
        return common_outputs

    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )

        # Generate decoder inputs
        decoder_seq_length = seq_length if not self.use_past else 1
        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, decoder_seq_length, is_pair, framework
        )
        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
        common_inputs = dict(**encoder_inputs, **decoder_inputs)

        if self.use_past:
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch
            batch, encoder_seq_length = common_inputs["input_ids"].shape
            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
            encoder_shape = (
                batch,
                num_encoder_attention_heads,
                encoder_seq_length,
                self._config.hidden_size // num_encoder_attention_heads,
            )
            decoder_past_length = decoder_seq_length + 3
            decoder_shape = (
                batch,
                num_decoder_attention_heads,
                decoder_past_length,
                self._config.hidden_size // num_decoder_attention_heads,
            )

            common_inputs["decoder_attention_mask"] = torch.cat(
                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
            )

            common_inputs["past_key_values"] = []
            # If the number of encoder and decoder layers are present in the model configuration, both are considered
            num_encoder_layers, num_decoder_layers = self.num_layers
            min_num_layers = min(num_encoder_layers, num_decoder_layers)
            max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
            remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"

            for _ in range(min_num_layers):
                common_inputs["past_key_values"].append(
                    (
                        torch.zeros(decoder_shape),
                        torch.zeros(decoder_shape),
                        torch.zeros(encoder_shape),
                        torch.zeros(encoder_shape),
                    )
                )
            # TODO: test this.
            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
            for _ in range(min_num_layers, max_num_layers):
                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
        return common_inputs

    def _generate_dummy_inputs_for_causal_lm(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
            tokenizer, batch_size, seq_length, is_pair, framework
        )

        if self.use_past:
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch
            batch, seqlen = common_inputs["input_ids"].shape
            # Not using the same length for past_key_values
            past_key_values_length = seqlen + 2
            num_encoder_layers, _ = self.num_layers
            num_encoder_attention_heads, _ = self.num_attention_heads
            past_shape = (
                batch,
                num_encoder_attention_heads,
                past_key_values_length,
                self._config.hidden_size // num_encoder_attention_heads,
            )

            mask_dtype = common_inputs["attention_mask"].dtype
            common_inputs["attention_mask"] = torch.cat(
                [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
            )
            common_inputs["past_key_values"] = [
                (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
            ]
        return common_inputs

    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
        batch_size = compute_effective_axis_dimension(
            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
        )

        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
        seq_length = compute_effective_axis_dimension(
            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
        )

        # Generate dummy inputs according to compute batch and sequence
        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
        common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
        return common_inputs

    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        if self.task in ["default", "seq2seq-lm"]:
            common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
            )

        elif self.task == "causal-lm":
            common_inputs = self._generate_dummy_inputs_for_causal_lm(
                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
            )
        else:
            common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
            )

        return common_inputs

    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
        if self.task in ["default", "seq2seq-lm"]:
            flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
        else:
            flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
                flattened_output, name, idx, t
            )


__all__ = ["Florence2Config", "Florence2OnnxConfig"]
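`Florence2OnnxConfig` plugs into the legacy `transformers.onnx` export path; a hedged sketch of an export call under the assumptions that this branch is installed, the `facebook/bart-large` tokenizer stands in for a Florence2 tokenizer, and `florence2.onnx` is an arbitrary output path:

```python
from pathlib import Path

from transformers import BartTokenizer, Florence2Config, Florence2ForConditionalGeneration
from transformers.models.florence2.configuration_florence2 import Florence2OnnxConfig
from transformers.onnx import export

config = Florence2Config()
model = Florence2ForConditionalGeneration(config).eval()
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

onnx_config = Florence2OnnxConfig(config, task="seq2seq-lm")
onnx_inputs, onnx_outputs = export(
    tokenizer, model, onnx_config, onnx_config.default_onnx_opset, Path("florence2.onnx")
)
```
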
@@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert FLORENCE2 checkpoint."""

import argparse
import os
from pathlib import Path

import fairseq
import torch
from packaging import version
from torch import nn

from transformers import (
    BartTokenizer,
    Florence2Config,
    Florence2ForConditionalGeneration,
    Florence2ForSequenceClassification,
    Florence2Model,
)
from transformers.utils import logging


FAIRSEQ_MODELS = ["florence2.large", "florence2.large.mnli", "florence2.large.cnn", "florence2_xsum/model.pt"]
extra_arch = {"florence2.large": Florence2Model, "florence2.large.mnli": Florence2ForSequenceClassification}
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
    raise Exception("requires fairseq >= 0.9.0")


logging.set_verbosity_info()
logger = logging.get_logger(__name__)

SAMPLE_TEXT = " Hello world! cécé herlolip"

mnli_rename_keys = [
    ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"),
    ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"),
    ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"),
    ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"),
]


def remove_ignore_keys_(state_dict):
    ignore_keys = [
        "encoder.version",
        "decoder.version",
        "model.encoder.version",
        "model.decoder.version",
        "_float_tensor",
    ]
    for k in ignore_keys:
        state_dict.pop(k, None)


def rename_key(dct, old, new):
    val = dct.pop(old)
    dct[new] = val


def load_xsum_checkpoint(checkpoint_path):
    """Checkpoint path should end in model.pt"""
    sd = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    hub_interface = torch.hub.load("pytorch/fairseq", "florence2.large.cnn").eval()
    hub_interface.model.load_state_dict(sd["model"])
    return hub_interface


def make_linear_from_emb(emb):
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer


@torch.no_grad()
def convert_florence2_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
    """
    Copy/paste/tweak model's weights to our BERT structure.
    """
    if not os.path.exists(checkpoint_path):
        florence2 = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
    else:
        florence2 = load_xsum_checkpoint(checkpoint_path)

    florence2.model.upgrade_state_dict(florence2.model.state_dict())
    if hf_checkpoint_name is None:
        hf_checkpoint_name = checkpoint_path.replace(".", "-")
    config = Florence2Config.from_pretrained(hf_checkpoint_name)
    tokens = florence2.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    if not torch.eq(tokens, tokens2).all():
        raise ValueError(
            f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}"
        )

    if checkpoint_path == "florence2.large.mnli":
        state_dict = florence2.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in mnli_rename_keys:
            rename_key(state_dict, src, dest)
        model = Florence2ForSequenceClassification(config).eval()
        model.load_state_dict(state_dict)
        fairseq_output = florence2.predict("mnli", tokens, return_logits=True)
        new_model_outputs = model(tokens)[0]  # logits
    else:  # no classification heads to worry about
        state_dict = florence2.model.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        fairseq_output = florence2.extract_features(tokens)
        if hf_checkpoint_name == "facebook/florence2-large":
            model = Florence2Model(config).eval()
            model.load_state_dict(state_dict)
            new_model_outputs = model(tokens).model[0]
        else:
            model = Florence2ForConditionalGeneration(config).eval()  # an existing summarization ckpt
            model.model.load_state_dict(state_dict)
            if hasattr(model, "lm_head"):
                model.lm_head = make_linear_from_emb(model.model.shared)
            new_model_outputs = model.model(tokens)[0]

    # Check results
    if fairseq_output.shape != new_model_outputs.shape:
        raise ValueError(
            f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape}"
        )
    if (fairseq_output != new_model_outputs).any().item():
        raise ValueError("Some values in `fairseq_output` are different from `new_model_outputs`")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "fairseq_path", type=str, help="florence2.large, florence2.large.cnn or a path to a model.pt on local filesystem."
    )
    parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument(
        "--hf_config", default=None, type=str, help="Which huggingface architecture to use: florence2-large-xsum"
    )
    args = parser.parse_args()
    convert_florence2_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config)
src/transformers/models/florence2/modeling_florence2.py (new file, 2216 lines; diff suppressed because it is too large)

tests/models/florence2/__init__.py (new file, empty)

tests/models/florence2/test_modeling_florence2.py (new file, 1508 lines; diff suppressed because it is too large)