diff --git a/docs/source/en/model_doc/bit.md b/docs/source/en/model_doc/bit.md index 550c07662dd..0813b67af9e 100644 --- a/docs/source/en/model_doc/bit.md +++ b/docs/source/en/model_doc/bit.md @@ -58,6 +58,11 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] BitImageProcessor - preprocess +## BitImageProcessorFast + +[[autodoc]] BitImageProcessorFast + - preprocess + ## BitModel [[autodoc]] BitModel diff --git a/docs/source/ja/model_doc/bit.md b/docs/source/ja/model_doc/bit.md index 76b24fa6447..ab0a7a4c685 100644 --- a/docs/source/ja/model_doc/bit.md +++ b/docs/source/ja/model_doc/bit.md @@ -54,6 +54,11 @@ BiT を始めるのに役立つ公式 Hugging Face およびコミュニティ ( [[autodoc]] BitImageProcessor - preprocess +## BitImageProcessorFast + +[[autodoc]] BitImageProcessorFast + - preprocess + ## BitModel [[autodoc]] BitModel diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 6387baa20cd..5e8dae8326e 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -59,7 +59,7 @@ else: ("align", ("EfficientNetImageProcessor",)), ("aria", ("AriaImageProcessor",)), ("beit", ("BeitImageProcessor",)), - ("bit", ("BitImageProcessor",)), + ("bit", ("BitImageProcessor", "BitImageProcessorFast")), ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")), ("blip-2", ("BlipImageProcessor", "BlipImageProcessorFast")), ("bridgetower", ("BridgeTowerImageProcessor",)), @@ -79,13 +79,13 @@ else: ("deta", ("DetaImageProcessor",)), ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), - ("dinov2", ("BitImageProcessor",)), + ("dinov2", ("BitImageProcessor", "BitImageProcessorFast")), ("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")), ("dpt", ("DPTImageProcessor",)), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", 
("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")), - ("focalnet", ("BitImageProcessor",)), + ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")), ("fuyu", ("FuyuImageProcessor",)), ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")), ("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")), @@ -93,7 +93,7 @@ else: ("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), ("grounding-dino", ("GroundingDinoImageProcessor",)), ("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")), - ("hiera", ("BitImageProcessor",)), + ("hiera", ("BitImageProcessor", "BitImageProcessorFast")), ("idefics", ("IdeficsImageProcessor",)), ("idefics2", ("Idefics2ImageProcessor",)), ("idefics3", ("Idefics3ImageProcessor",)), diff --git a/src/transformers/models/bit/__init__.py b/src/transformers/models/bit/__init__.py index 3b6ba91b032..edfeb4dbe75 100644 --- a/src/transformers/models/bit/__init__.py +++ b/src/transformers/models/bit/__init__.py @@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure if TYPE_CHECKING: from .configuration_bit import * from .image_processing_bit import * + from .image_processing_bit_fast import * from .modeling_bit import * else: import sys diff --git a/src/transformers/models/bit/image_processing_bit_fast.py b/src/transformers/models/bit/image_processing_bit_fast.py new file mode 100644 index 00000000000..19b3fbfd32b --- /dev/null +++ b/src/transformers/models/bit/image_processing_bit_fast.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for BiT.""" + +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BaseImageProcessorFast, +) +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import add_start_docstrings + + +@add_start_docstrings( + "Constructs a fast Bit image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, +) +class BitImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + rescale_factor = 1 / 255 + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["BitImageProcessorFast"] diff --git a/tests/models/bit/test_image_processing_bit.py b/tests/models/bit/test_image_processing_bit.py new file mode 100644 index 00000000000..8ad72a4c5b9 --- /dev/null +++ b/tests/models/bit/test_image_processing_bit.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import BitImageProcessor + + if is_torchvision_available(): + from transformers import BitImageProcessorFast + + +class BitImageProcessingTester: + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + super().__init__() + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": 
self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class BitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = BitImageProcessor if is_vision_available() else None + fast_image_processing_class = BitImageProcessorFast if is_torchvision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = BitImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + 
self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})