This commit is contained in:
Duc-Viet Hoang 2025-05-19 23:55:03 +07:00
parent dc245e76db
commit 8e7aa374cf
9 changed files with 511 additions and 2767 deletions

View File

@ -1,4 +1,4 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
@ -18,50 +18,51 @@ rendered properly in your Markdown viewer.
## Overview
The Florence2 model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>
The Florence2 model was proposed in [Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242) by Microsoft.
Florence-2 is an advanced vision foundation model that uses a prompt-based approach to handle a wide range of vision and vision-language tasks. Florence-2 can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation. It leverages the FLD-5B dataset, containing 5.4 billion annotations across 126 million images, to master multi-task learning. The model's sequence-to-sequence architecture enables it to excel in both zero-shot and fine-tuned settings, proving to be a competitive vision foundation model.
The abstract from the paper is the following:
*<INSERT PAPER ABSTRACT HERE>*
Tips:
<INSERT TIPS ABOUT MODEL HERE>
This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
*We introduce Florence-2, a novel vision foundation model with a unified, prompt-based representation for a variety of computer vision and vision-language tasks. While existing large vision models excel in transfer learning, they struggle to perform a diversity of tasks with simple instructions, a capability that implies handling the complexity of various spatial hierarchy and semantic granularity. Florence-2 was designed to take text-prompt as task instructions and generate desirable results in text forms, whether it be captioning, object detection, grounding or segmentation. This multi-task learning setup demands large-scale, high-quality annotated data. To this end, we co-developed FLD-5B that consists of 5.4 billion comprehensive visual annotations on 126 million images, using an iterative strategy of automated image annotation and model refinement. We adopted a sequence-to-sequence structure to train Florence-2 to perform versatile and comprehensive vision tasks. Extensive evaluations on numerous tasks demonstrated Florence-2 to be a strong vision foundation model contender with unprecedented zero-shot and fine-tuning capabilities.*
This model was contributed by [hlky](https://huggingface.co/hlky).
The original code can be found [here](https://huggingface.co/microsoft/Florence-2-base/tree/main).
## Florence2Config
[[autodoc]] Florence2Config
- all
## Florence2Model
## Florence2Processor
[[autodoc]] Florence2Model
- forward
[[autodoc]] Florence2Processor
## Florence2ForConditionalGeneration
[[autodoc]] Florence2ForConditionalGeneration
- forward
## Florence2ForSequenceClassification
## Florence2LanguageForConditionalGeneration
[[autodoc]] Florence2ForSequenceClassification
[[autodoc]] Florence2LanguageForConditionalGeneration
- forward
## Florence2ForQuestionAnswering
## Florence2LanguageModel
[[autodoc]] Florence2ForQuestionAnswering
[[autodoc]] Florence2LanguageModel
- forward
## Florence2ForCausalLM
## Florence2Vision
[[autodoc]] Florence2ForCausalLM
[[autodoc]] Florence2Vision
- forward
</pt>
<tf>
## Florence2VisionModel
[[autodoc]] Florence2VisionModel
- forward
## Florence2VisionModelWithProjection
[[autodoc]] Florence2VisionModelWithProjection
- forward

View File

@ -119,6 +119,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("fastspeech2_conformer", "FastSpeech2ConformerModel"),
("flaubert", "FlaubertModel"),
("flava", "FlavaModel"),
("florence2", "Florence2ForConditionalGeneration"),
("fnet", "FNetModel"),
("focalnet", "FocalNetModel"),
("fsmt", "FSMTModel"),
@ -884,8 +885,8 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
("blip-2", "Blip2ForConditionalGeneration"),
("chameleon", "ChameleonForConditionalGeneration"),
("emu3", "Emu3ForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
("florence2", "Florence2ForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
("gemma3", "Gemma3ForConditionalGeneration"),
("git", "GitForCausalLM"),
("got_ocr2", "GotOcr2ForConditionalGeneration"),

View File

@ -322,3 +322,6 @@ class Florence2Config(PretrainedConfig):
self.text_config = Florence2LanguageConfig(**text_config)
super().__init__(**kwargs)
__all__ = ["Florence2Config"]

File diff suppressed because it is too large Load Diff

View File

@ -28,7 +28,7 @@ from ...utils import ( # noqa: F401
logging,
replace_return_docstrings,
)
from ..bart.modeling_bart import BartForConditionalGeneration
from ..bart.modeling_bart import BartForConditionalGeneration, BartPreTrainedModel
from .configuration_florence2 import Florence2Config, Florence2LanguageConfig, Florence2VisionConfig # noqa: F401
@ -984,6 +984,14 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
return x
# Base class for the Florence-2 language-side models. It subclasses
# `BartPreTrainedModel` (imported from `..bart.modeling_bart`) and adds no
# attributes or methods of its own.
class Florence2LanguagePreTrainedModel(BartPreTrainedModel):
pass
# Florence-2 language model class. As written it only subclasses
# `Florence2LanguagePreTrainedModel` and defines nothing itself.
# NOTE(review): the sibling `Florence2LanguageForConditionalGeneration` derives
# from the concrete `BartForConditionalGeneration`, whereas this class derives
# only from the pretrained base -- confirm whether a concrete Bart model class
# was intended as the parent here.
class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
pass
# Florence-2 language model with a generation head. It subclasses
# `BartForConditionalGeneration` (imported from `..bart.modeling_bart`) and
# adds no attributes or methods of its own.
class Florence2LanguageForConditionalGeneration(BartForConditionalGeneration):
pass
@ -1296,3 +1304,15 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
# Forwards every positional and keyword argument unchanged to the
# `_reorder_cache` method of the wrapped `self.language_model` and returns
# whatever that call returns.
def _reorder_cache(self, *args, **kwargs):
return self.language_model._reorder_cache(*args, **kwargs)
# Public names exported by this module (the names a star-import exposes).
__all__ = [
"Florence2ForConditionalGeneration",
"Florence2LanguageForConditionalGeneration",
"Florence2LanguageModel",
"Florence2LanguagePreTrainedModel",
"Florence2PreTrainedModel",
"Florence2Vision",
"Florence2VisionModel",
"Florence2VisionModelWithProjection",
]

View File

@ -159,7 +159,7 @@ class Florence2Processor(ProcessorMixin):
"<REGION_TO_OCR>": "What text is in the region {input}?",
}
self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
self.post_processor = Florence2PostProcessor(tokenizer=tokenizer)
super().__init__(image_processor, tokenizer)
@ -513,7 +513,7 @@ class CoordinatesQuantizer(object):
return dequantized_coordinates
class Florence2PostProcesser(object):
class Florence2PostProcessor(object):
r"""
Florence-2 post process for converting text prediction to various tasks results.
@ -1220,3 +1220,6 @@ class Florence2PostProcesser(object):
raise ValueError("task {} is not supported".format(task))
return parsed_dict
__all__ = ["Florence2Processor"]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,63 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
from transformers import AutoProcessor, AutoTokenizer, BartTokenizerFast, Florence2Processor
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import CLIPImageProcessor
@require_vision
class Florence2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for `Florence2Processor`.

    `setUp` builds a processor from the `microsoft/Florence-2-base` checkpoint
    and saves it into a temporary directory. The `get_*` helpers reload
    components from that directory, and `tearDown` removes it.
    """

    processor_class = Florence2Processor

    def setUp(self):
        """Create a temporary directory and save a freshly built processor into it."""
        self.tmpdirname = tempfile.mkdtemp()

        image_processor = CLIPImageProcessor.from_pretrained("microsoft/Florence-2-base")
        tokenizer = BartTokenizerFast.from_pretrained("microsoft/Florence-2-base")
        processor_kwargs = self.prepare_processor_dict()
        processor = Florence2Processor(image_processor, tokenizer, **processor_kwargs)
        processor.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        """Return the tokenizer of the processor reloaded from the temporary directory."""
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

    def get_image_processor(self, **kwargs):
        """Return the image processor of the processor reloaded from the temporary directory."""
        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

    def tearDown(self):
        """Delete the temporary directory created in `setUp`."""
        shutil.rmtree(self.tmpdirname)

    # Adjacent string literals are concatenated verbatim, so each piece must end
    # with a space or the skip reason runs words together.
    @unittest.skip(
        "Skip because the model has no processor kwargs except for chat template and "
        "chat template is saved as a separate file. Stop skipping this test when the processor "
        "has new kwargs saved in config file."
    )
    def test_processor_to_json_string(self):
        pass

    def test_can_load_various_tokenizers(self):
        """Check the processor's tokenizer class matches what `AutoTokenizer` resolves."""
        for checkpoint in ["microsoft/Florence-2-base"]:
            processor = Florence2Processor.from_pretrained(checkpoint)
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)

View File

@ -164,6 +164,11 @@ IGNORE_NON_TESTED = (
"CsmDepthDecoderForCausalLM", # Building part of bigger (tested) model. Tested implicitly through CsmForConditionalGenerationIntegrationTest.
"CsmDepthDecoderModel", # Building part of bigger (tested) model. Tested implicitly through CsmForConditionalGenerationIntegrationTest.
"CsmBackboneModel", # Building part of bigger (tested) model. Tested implicitly through CsmForConditionalGenerationIntegrationTest.
"Florence2LanguageModel", # Building part of bigger (tested) model. Tested implicitly through Florence2ForConditionalGeneration.
"Florence2Vision", # Building part of bigger (tested) model. Tested implicitly through Florence2ForConditionalGeneration.
"Florence2VisionModel", # Building part of bigger (tested) model. Tested implicitly through Florence2ForConditionalGeneration.
"Florence2VisionModelWithProjection", # Building part of bigger (tested) model. Tested implicitly through Florence2ForConditionalGeneration.
"Florence2LanguageForConditionalGeneration", # Building part of bigger (tested) model. Tested implicitly through Florence2ForConditionalGeneration.
]
)
@ -377,6 +382,11 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"CsmDepthDecoderModel", # Building part of a bigger model
"CsmDepthDecoderForCausalLM", # Building part of a bigger model
"CsmForConditionalGeneration", # Building part of a bigger model
"Florence2LanguageForConditionalGeneration", # Building part of a bigger model
"Florence2LanguageModel", # Building part of a bigger model
"Florence2Vision", # Building part of a bigger model
"Florence2VisionModel", # Building part of a bigger model
"Florence2VisionModelWithProjection", # Building part of a bigger model
]
# DO NOT edit this list!