diff --git a/docs/source/en/model_doc/chinese_clip.md b/docs/source/en/model_doc/chinese_clip.md index c73fee0422f..f44cdbd1456 100644 --- a/docs/source/en/model_doc/chinese_clip.md +++ b/docs/source/en/model_doc/chinese_clip.md @@ -90,6 +90,11 @@ Currently, following scales of pretrained Chinese-CLIP models are available on [[autodoc]] ChineseCLIPImageProcessor - preprocess +## ChineseCLIPImageProcessorFast + +[[autodoc]] ChineseCLIPImageProcessorFast + - preprocess + ## ChineseCLIPFeatureExtractor [[autodoc]] ChineseCLIPFeatureExtractor diff --git a/docs/source/ja/model_doc/chinese_clip.md b/docs/source/ja/model_doc/chinese_clip.md index 8d7dc401d2a..68eff8e4131 100644 --- a/docs/source/ja/model_doc/chinese_clip.md +++ b/docs/source/ja/model_doc/chinese_clip.md @@ -86,6 +86,11 @@ Chinese-CLIP モデルは、[OFA-Sys](https://huggingface.co/OFA-Sys) によっ [[autodoc]] ChineseCLIPImageProcessor - preprocess +## ChineseCLIPImageProcessorFast + +[[autodoc]] ChineseCLIPImageProcessorFast + - preprocess + ## ChineseCLIPFeatureExtractor [[autodoc]] ChineseCLIPFeatureExtractor diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index b5a87308e4f..a5cdeb314fd 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -64,7 +64,7 @@ else: ("blip-2", ("BlipImageProcessor", "BlipImageProcessorFast")), ("bridgetower", ("BridgeTowerImageProcessor",)), ("chameleon", ("ChameleonImageProcessor",)), - ("chinese_clip", ("ChineseCLIPImageProcessor",)), + ("chinese_clip", ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast")), ("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")), ("conditional_detr", ("ConditionalDetrImageProcessor",)), diff --git a/src/transformers/models/chinese_clip/__init__.py b/src/transformers/models/chinese_clip/__init__.py index fc1f002a16a..9c4476b9080 100644 --- a/src/transformers/models/chinese_clip/__init__.py +++ b/src/transformers/models/chinese_clip/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: from .configuration_chinese_clip import * from .feature_extraction_chinese_clip import * from .image_processing_chinese_clip import * + from .image_processing_chinese_clip_fast import * from .modeling_chinese_clip import * from .processing_chinese_clip import * else: diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip_fast.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip_fast.py new file mode 100644 index 00000000000..a1cb38b8a25 --- /dev/null +++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip_fast.py @@ -0,0 +1,40 @@ +# coding=utf-8 +# Copyright 2025 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fast Image processor class for Chinese-CLIP.""" + +from ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import add_start_docstrings + + +@add_start_docstrings( + "Constructs a fast ChineseCLIP image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, +) +class ChineseCLIPImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["ChineseCLIPImageProcessorFast"] diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 958adfdd0ae..3523c782f3a 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -44,7 +44,7 @@ class ChineseCLIPProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "ChineseCLIPImageProcessor" + image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast") tokenizer_class = ("BertTokenizer", "BertTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): diff --git a/tests/models/chinese_clip/test_image_processing_chinese_clip.py b/tests/models/chinese_clip/test_image_processing_chinese_clip.py index bfacac7a168..7acae860b08 100644 --- a/tests/models/chinese_clip/test_image_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_image_processing_chinese_clip.py @@ -16,7 +16,7 @@ import unittest from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_vision_available +from transformers.utils import is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -24,6 +24,9 @@ from ...test_image_processing_common import ImageProcessingTestMixin, prepare_im if is_vision_available(): from transformers import ChineseCLIPImageProcessor + if is_torchvision_available(): + from transformers import ChineseCLIPImageProcessorFast + class ChineseCLIPImageProcessingTester: def __init__( @@ -91,6 +94,7 @@ class ChineseCLIPImageProcessingTester: @require_vision class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None + fast_image_processing_class = ChineseCLIPImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() @@ -101,24 +105,26 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - 
self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processing, "do_resize"))
+            self.assertTrue(hasattr(image_processing, "size"))
+            self.assertTrue(hasattr(image_processing, "do_center_crop"))
+            self.assertTrue(hasattr(image_processing, "center_crop"))
+            self.assertTrue(hasattr(image_processing, "do_normalize"))
+            self.assertTrue(hasattr(image_processing, "image_mean"))
+            self.assertTrue(hasattr(image_processing, "image_std"))
+            self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
 
     def test_image_processor_from_dict_with_kwargs(self):
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
-        self.assertEqual(image_processor.size, {"height": 224, "width": 224})
-        self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
+        for image_processing_class in self.image_processor_list:
+            image_processor = image_processing_class.from_dict(self.image_processor_dict)
+            self.assertEqual(image_processor.size, {"height": 224, "width": 224})
+            self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18})
 
-        image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
-        self.assertEqual(image_processor.size, {"shortest_edge": 42})
-        self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
+            image_processor = image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84)
+            self.assertEqual(image_processor.size, {"shortest_edge": 42})
+            self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84})
 
     @unittest.skip(
         reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet"
@@ -131,6 +137,7 @@ class ChineseCLIPImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase
 @require_vision
 class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None
+    fast_image_processing_class = ChineseCLIPImageProcessorFast if is_torchvision_available() else None
 
     def setUp(self):
         super().setUp()
@@ -142,15 +149,16 @@ class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingTestMixin, unitt
         return self.image_processor_tester.prepare_image_processor_dict()
 
     def test_image_processor_properties(self):
-        image_processing = self.image_processing_class(**self.image_processor_dict)
-        self.assertTrue(hasattr(image_processing, "do_resize"))
-        self.assertTrue(hasattr(image_processing, "size"))
-        self.assertTrue(hasattr(image_processing, "do_center_crop"))
-        self.assertTrue(hasattr(image_processing, "center_crop"))
-        self.assertTrue(hasattr(image_processing, "do_normalize"))
-        self.assertTrue(hasattr(image_processing, "image_mean"))
-        self.assertTrue(hasattr(image_processing, "image_std"))
-        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
+        for image_processing_class in self.image_processor_list:
+            image_processing = image_processing_class(**self.image_processor_dict)
+            self.assertTrue(hasattr(image_processing, "do_resize"))
+            self.assertTrue(hasattr(image_processing, "size"))
+            self.assertTrue(hasattr(image_processing, "do_center_crop"))
+            self.assertTrue(hasattr(image_processing, "center_crop"))
+            self.assertTrue(hasattr(image_processing, "do_normalize"))
+            self.assertTrue(hasattr(image_processing, "image_mean"))
+            self.assertTrue(hasattr(image_processing, "image_std"))
+            self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
 
     @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet")  # FIXME Amy
     def test_call_numpy(self):
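
A minimal usage sketch of the new fast processor, assuming the public `OFA-Sys/chinese-clip-vit-base-patch16` checkpoint and a local `torchvision` install; with the `image_processing_auto.py` change above, `AutoImageProcessor` with `use_fast=True` resolves to `ChineseCLIPImageProcessorFast`:

```python
from PIL import Image

from transformers import AutoImageProcessor

# With the auto-mapping change above, use_fast=True picks the
# torchvision-backed ChineseCLIPImageProcessorFast.
processor = AutoImageProcessor.from_pretrained(
    "OFA-Sys/chinese-clip-vit-base-patch16", use_fast=True
)
print(type(processor).__name__)  # ChineseCLIPImageProcessorFast

# Any PIL image works; a blank RGB image keeps the sketch self-contained.
image = Image.new("RGB", (640, 480), color="white")

inputs = processor(images=image, return_tensors="pt")
# Resized to shortest_edge=224, then center-cropped to 224x224 per the class defaults.
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
```

Outputs should match the slow `ChineseCLIPImageProcessor` up to small interpolation differences in the BICUBIC resize, which is what the shared `ImageProcessingTestMixin` equivalence tests exercise.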