Remove differences between init and preprocess kwargs for fast image processors (#36186)

* Remove differences between init and preprocess kwargs in fast image processors

* make modifications to got_ocr2

* update gemma3
Authored by Yoni Gozlan on 2025-03-12 19:44:05 -04:00, committed via GitHub
parent cc3a361b46
commit ea219ed164
15 changed files with 136 additions and 198 deletions
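The user-visible effect of this change: a fast image processor now accepts the same set of keyword arguments at construction time and at call time, both validated against a single `valid_kwargs` TypedDict. A minimal usage sketch (the checkpoint name is illustrative, not part of this commit):

from PIL import Image
from transformers import AutoImageProcessor

image = Image.new("RGB", (224, 224))
# Any kwarg from the processor's Kwargs TypedDict can be set at init time...
processor = AutoImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", use_fast=True, do_pad=False)
# ...and the very same kwarg can be overridden per call.
batch = processor(images=image, do_pad=True, return_tensors="pt")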

src/transformers/image_processing_utils_fast.py

@@ -126,7 +126,7 @@ def divide_to_patches(
     return patches


-class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
+class DefaultFastImageProcessorKwargs(TypedDict, total=False):
     do_resize: Optional[bool]
     size: Optional[Dict[str, int]]
     default_to_square: Optional[bool]
@@ -139,9 +139,6 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
     image_mean: Optional[Union[float, List[float]]]
     image_std: Optional[Union[float, List[float]]]
     do_convert_rgb: Optional[bool]
-
-
-class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwargs):
     return_tensors: Optional[Union[str, TensorType]]
     data_format: Optional[ChannelDimension]
     input_data_format: Optional[Union[str, ChannelDimension]]
@@ -185,8 +182,20 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r"""
         Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
         number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
         Can be overridden by the `image_std` parameter in the `preprocess` method.
-    do_convert_rgb (`bool`, *optional*, defaults to `self.image_std`):
-        Whether to convert the image to RGB."""
+    do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+        Whether to convert the image to RGB.
+    return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
+        Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
+    data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
+        Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
+    input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
+        The channel dimension format for the input image. If unset, the channel dimension format is inferred
+        from the input image. Can be one of:
+        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+    device (`torch.device`, *optional*, defaults to `self.device`):
+        The device to process the images on. If unset, the device is inferred from the input images."""

 BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
     Preprocess an image or batch of images.
@@ -219,20 +228,17 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
         `True`.
     do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
         Whether to convert the image to RGB.
-    return_tensors (`str` or `TensorType`, *optional*):
+    return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
         Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
-    data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
-        The channel dimension format for the output image. Can be one of:
-        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-        - Unset: Use the channel dimension format of the input image.
-    input_data_format (`ChannelDimension` or `str`, *optional*):
+    data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
+        Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
+    input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
         The channel dimension format for the input image. If unset, the channel dimension format is inferred
         from the input image. Can be one of:
         - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
         - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
         - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-    device (`torch.device`, *optional*):
+    device (`torch.device`, *optional*, defaults to `self.device`):
         The device to process the images on. If unset, the device is inferred from the input images."""
@@ -253,13 +259,16 @@ class BaseImageProcessorFast(BaseImageProcessor):
     rescale_factor = 1 / 255
     do_normalize = None
     do_convert_rgb = None
+    return_tensors = None
+    data_format = ChannelDimension.FIRST
+    input_data_format = None
+    device = None
     model_input_names = ["pixel_values"]
-    valid_init_kwargs = DefaultFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DefaultFastImageProcessorPreprocessKwargs
+    valid_kwargs = DefaultFastImageProcessorKwargs

     def __init__(
         self,
-        **kwargs: Unpack[DefaultFastImageProcessorInitKwargs],
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
     ) -> None:
         super().__init__(**kwargs)
         size = kwargs.pop("size", self.size)
@@ -270,7 +279,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         )
         crop_size = kwargs.pop("crop_size", self.crop_size)
         self.crop_size = get_size_dict(crop_size, param_name="crop_size") if crop_size is not None else None
-        for key in self.valid_init_kwargs.__annotations__.keys():
+        for key in self.valid_kwargs.__annotations__.keys():
             kwarg = kwargs.pop(key, None)
             if kwarg is not None:
                 setattr(self, key, kwarg)
@@ -553,14 +562,12 @@ class BaseImageProcessorFast(BaseImageProcessor):
     def preprocess(
         self,
         images: ImageInput,
-        **kwargs: Unpack[DefaultFastImageProcessorPreprocessKwargs],
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
     ) -> BatchFeature:
-        validate_kwargs(
-            captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
-        )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
-        for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
+        for kwarg_name in self.valid_kwargs.__annotations__:
             kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))

         # Extract parameters that are only used for preparing the input images
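Taken together, the base-class changes converge on one subclassing pattern, which the model files below all follow. A minimal sketch of that pattern (the `MyModel*` names and the `do_special_crop` option are hypothetical, for illustration only):

from typing import Optional

from typing_extensions import Unpack

from transformers.image_processing_utils import BatchFeature
from transformers.image_processing_utils_fast import (
    BaseImageProcessorFast,
    DefaultFastImageProcessorKwargs,
)
from transformers.image_utils import ImageInput


class MyModelFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    # Hypothetical model-specific option; subclasses now extend a single TypedDict.
    do_special_crop: Optional[bool]


class MyModelImageProcessorFast(BaseImageProcessorFast):
    do_special_crop = False
    # One class attribute drives validation for both __init__ and preprocess.
    valid_kwargs = MyModelFastImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[MyModelFastImageProcessorKwargs]):
        super().__init__(**kwargs)

    def preprocess(self, images: ImageInput, **kwargs: Unpack[MyModelFastImageProcessorKwargs]) -> BatchFeature:
        return super().preprocess(images, **kwargs)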

src/transformers/models/convnext/image_processing_convnext_fast.py

@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -54,11 +53,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class ConvNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    crop_pct: Optional[float]
-
-
-class ConvNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     crop_pct: Optional[float]
@@ -81,10 +76,9 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     crop_pct = 224 / 256
-    valid_init_kwargs = ConvNextFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = ConvNextFastImageProcessorPreprocessKwargs
+    valid_kwargs = ConvNextFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -95,9 +89,7 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
             overridden by `crop_pct` in the `preprocess` method.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def resize(

src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py

@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     get_image_size_for_max_height_width,
     get_max_height_width,
@@ -58,21 +57,12 @@ elif is_torchvision_available():
 logger = logging.get_logger(__name__)


-class DeformableDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class DeformableDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]

 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@@ -294,6 +284,8 @@ def prepare_coco_panoptic_annotation(
         The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
         provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
         height and width in the batch.
+    return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+        Whether to return segmentation masks.
     """,
 )
 class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
@@ -308,10 +300,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
     size = {"shortest_edge": 800, "longest_edge": 1333}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = DeformableDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DeformableDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = DeformableDetrFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@@ -605,7 +596,11 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
         """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[DeformableDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@@ -621,7 +616,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
             )
             kwargs["size"] = kwargs.pop("max_size")

-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def _preprocess(
         self,

src/transformers/models/detr/image_processing_detr_fast.py

@@ -24,8 +24,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     get_image_size_for_max_height_width,
     get_max_height_width,
@@ -283,21 +282,12 @@ def prepare_coco_panoptic_annotation(
     return new_target


-class DetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]

 @add_start_docstrings(
@@ -319,6 +309,8 @@ class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocess
         The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
         provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
         height and width in the batch.
+    return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+        Whether to return segmentation masks.
     """,
 )
 class DetrImageProcessorFast(BaseImageProcessorFast):
@@ -333,10 +325,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
     size = {"shortest_edge": 800, "longest_edge": 1333}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = DetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = DetrFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[DetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@@ -629,7 +620,13 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
             Path to the directory containing the segmentation masks.
         """,
     )
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[DetrFastImageProcessorPreprocessKwargs]) -> BatchFeature:
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[DetrFastImageProcessorKwargs],
+    ) -> BatchFeature:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
             logger.warning_once(
@@ -644,7 +641,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
             )
             kwargs["size"] = kwargs.pop("max_size")

-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def _preprocess(
         self,
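Since `annotations` and `masks_path` are now explicit parameters rather than TypedDict entries, a detection preprocessing call looks like the following sketch (the annotation values are illustrative COCO-detection data, not from this commit):

from PIL import Image
from transformers import DetrImageProcessorFast

processor = DetrImageProcessorFast()
image = Image.new("RGB", (640, 480))
annotations = {
    "image_id": 0,
    "annotations": [{"bbox": [10, 10, 50, 50], "category_id": 1, "area": 2500.0, "iscrowd": 0}],
}
# annotations is an explicit parameter; remaining kwargs go through the merged TypedDict.
batch = processor.preprocess(image, annotations=annotations, return_tensors="pt")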

src/transformers/models/gemma3/image_processing_gemma3_fast.py

@@ -24,8 +24,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
     BatchFeature,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     get_size_dict,
     group_images_by_shape,
     reorder_images,
@@ -67,14 +66,7 @@ if is_torchvision_available():
 logger = logging.get_logger(__name__)


-class Gemma3FastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    do_pan_and_scan: Optional[bool]
-    pan_and_scan_min_crop_size: Optional[int]
-    pan_and_scan_max_num_crops: Optional[int]
-    pan_and_scan_min_ratio_to_activate: Optional[float]
-
-
-class Gemma3FastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     do_pan_and_scan: Optional[bool]
     pan_and_scan_min_crop_size: Optional[int]
     pan_and_scan_max_num_crops: Optional[int]
@@ -108,10 +100,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
     pan_and_scan_min_crop_size = None
     pan_and_scan_max_num_crops = None
     pan_and_scan_min_ratio_to_activate = None
-    valid_init_kwargs = Gemma3FastImageProcessorInitKwargs
-    valid_preprocess_kwargs = Gemma3FastImageProcessorPreprocessKwargs
+    valid_kwargs = Gemma3FastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]):
         super().__init__(**kwargs)

     def _prepare_images_structure(
@@ -262,14 +253,12 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
     def preprocess(
         self,
         images: ImageInput,
-        **kwargs: Unpack[Gemma3FastImageProcessorPreprocessKwargs],
+        **kwargs: Unpack[Gemma3FastImageProcessorKwargs],
     ) -> BatchFeature:
-        validate_kwargs(
-            captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
-        )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
-        for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
+        for kwarg_name in self.valid_kwargs.__annotations__:
             kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))

         # Extract parameters that are only used for preparing the input images
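The merged Gemma3 kwargs class keeps the pan-and-scan controls usable both at init time and per call; a short sketch (the checkpoint name is illustrative and may be gated):

from PIL import Image
from transformers import AutoImageProcessor

image = Image.new("RGB", (896, 896))
processor = AutoImageProcessor.from_pretrained("google/gemma-3-4b-it", use_fast=True)
# Pan-and-scan options come from the same merged TypedDict as the base kwargs.
batch = processor(images=image, do_pan_and_scan=True, pan_and_scan_max_num_crops=4, return_tensors="pt")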

src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py

@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -54,13 +53,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class GotOcr2ImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
-
-
-class GotOcr2ImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class GotOcr2ImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     crop_to_patches: Optional[bool]
     min_patches: Optional[int]
     max_patches: Optional[int]
@@ -93,10 +86,9 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
     crop_to_patches = False
     min_patches = 1
     max_patches = 12
-    valid_init_kwargs = GotOcr2ImageProcessorInitKwargs
-    valid_preprocess_kwargs = GotOcr2ImageProcessorPreprocessKwargs
+    valid_kwargs = GotOcr2ImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[valid_kwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -113,7 +105,7 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
             set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
         """,
     )
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorPreprocessKwargs]) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[valid_kwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def crop_image_to_patches(
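The GOT-OCR2 patch-cropping options live in the same merged class; a sketch instantiating the processor directly with its class defaults (the image size is arbitrary):

from PIL import Image
from transformers import GotOcr2ImageProcessorFast

processor = GotOcr2ImageProcessorFast()
image = Image.new("RGB", (1024, 768))
# crop_to_patches and its bounds are ordinary kwargs of the single TypedDict.
batch = processor(images=image, crop_to_patches=True, min_patches=1, max_patches=6, return_tensors="pt")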

src/transformers/models/llava/image_processing_llava_fast.py

@@ -23,8 +23,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -61,11 +60,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class LlavaFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    do_pad: Optional[bool]
-
-
-class LlavaFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     do_pad: Optional[bool]
@@ -90,10 +85,9 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     do_convert_rgb = True
-    valid_init_kwargs = LlavaFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None:
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -103,9 +97,7 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
             Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def pad_to_square(

src/transformers/models/llava_next/image_processing_llava_next_fast.py

@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     divide_to_patches,
     group_images_by_shape,
     reorder_images,
@@ -57,12 +56,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class LlavaNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    image_grid_pinpoints: Optional[List[List[int]]]
-    do_pad: Optional[bool]
-
-
-class LlavaNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     image_grid_pinpoints: Optional[List[List[int]]]
     do_pad: Optional[bool]
@@ -96,10 +90,9 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
     do_convert_rgb = True
     do_pad = True
     image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
-    valid_init_kwargs = LlavaNextFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaNextFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaNextFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -113,9 +106,7 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
             number of patches in the batch. Padding will be applied to the bottom and right with zeros.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def _prepare_images_structure(

src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py

@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     divide_to_patches,
     group_images_by_shape,
     reorder_images,
@@ -40,12 +39,7 @@ else:
     from torchvision.transforms import functional as F


-class LlavaOnevisionFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    image_grid_pinpoints: Optional[List[List[int]]]
-    do_pad: Optional[bool]
-
-
-class LlavaOnevisionFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     image_grid_pinpoints: Optional[List[List[int]]]
     do_pad: Optional[bool]
@@ -77,11 +71,10 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
     do_convert_rgb = True
     do_pad = True
     image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]]  # fmt: skip
-    valid_init_kwargs = LlavaOnevisionFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaOnevisionFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaOnevisionFastImageProcessorKwargs
     model_input_names = ["pixel_values_videos"]

-    def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -95,9 +88,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
             number of patches in the batch. Padding will be applied to the bottom and right with zeros.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def _prepare_images_structure(

src/transformers/models/pixtral/image_processing_pixtral_fast.py

@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -61,11 +60,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class PixtralFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    patch_size: Optional[Dict[str, int]]
-
-
-class PixtralFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     patch_size: Optional[Dict[str, int]]
@@ -88,10 +83,9 @@ class PixtralImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     do_convert_rgb = True
-    valid_init_kwargs = PixtralFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = PixtralFastImageProcessorPreprocessKwargs
+    valid_kwargs = PixtralFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -101,9 +95,7 @@ class PixtralImageProcessorFast(BaseImageProcessorFast):
             Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def resize(

src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py

@@ -25,7 +25,7 @@ from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -69,7 +69,7 @@ elif is_torchvision_available():
 logger = logging.get_logger(__name__)


-class Qwen2VLFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class Qwen2VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     min_pixels: Optional[int]
     max_pixels: Optional[int]
     patch_size: Optional[int]
@@ -107,10 +107,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
     merge_size = 2
     min_pixels = 56 * 56
     max_pixels = 28 * 28 * 1280
-    valid_init_kwargs = Qwen2VLFastImageProcessorInitKwargs
+    valid_kwargs = DefaultFastImageProcessorKwargs
     model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]

-    def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     def _preprocess(
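Qwen2-VL's pixel-budget options remain available through the single kwargs class; a sketch (the checkpoint name and pixel budgets are illustrative):

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    use_fast=True,
    min_pixels=256 * 28 * 28,   # lower bound on the resized pixel count
    max_pixels=1024 * 28 * 28,  # upper bound on the resized pixel count
)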

src/transformers/models/rt_detr/image_processing_rt_detr_fast.py

@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     add_start_docstrings,
     get_image_size_for_max_height_width,
@@ -53,21 +52,12 @@ elif is_torchvision_available():
     from torchvision.transforms import functional as F


-class RTDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class RTDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]

 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@@ -151,6 +141,8 @@ def prepare_coco_detection_annotation(
         The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
         provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
         height and width in the batch.
+    return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+        Whether to return segmentation masks.
     """,
 )
 class RTDetrImageProcessorFast(BaseImageProcessorFast):
@@ -165,11 +157,10 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
     size = {"height": 640, "width": 640}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = RTDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = RTDetrFastImageProcessorKwargs
     do_convert_annotations = True

-    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
         do_convert_annotations = kwargs.get("do_convert_annotations", None)
         do_normalize = kwargs.get("do_normalize", None)
@@ -424,9 +415,13 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
         """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[RTDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def _preprocess(
         self,

src/transformers/models/rt_detr/modular_rt_detr.py

@@ -2,8 +2,7 @@ import pathlib
 from typing import Dict, List, Optional, Tuple, Union

 from transformers.models.detr.image_processing_detr_fast import (
-    DetrFastImageProcessorInitKwargs,
-    DetrFastImageProcessorPreprocessKwargs,
+    DetrFastImageProcessorKwargs,
     DetrImageProcessorFast,
 )
@@ -112,11 +111,7 @@ def prepare_coco_detection_annotation(
     return new_target


-class RTDetrFastImageProcessorInitKwargs(DetrFastImageProcessorInitKwargs):
-    pass
-
-
-class RTDetrFastImageProcessorPreprocessKwargs(DetrFastImageProcessorPreprocessKwargs):
+class RTDetrFastImageProcessorKwargs(DetrFastImageProcessorKwargs):
     pass
@@ -133,10 +128,9 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
     size = {"height": 640, "width": 640}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = RTDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = RTDetrFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
         do_convert_annotations = kwargs.get("do_convert_annotations", None)
         do_normalize = kwargs.get("do_normalize", None)
@@ -181,9 +175,13 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
         """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[RTDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
-        return BaseImageProcessorFast().preprocess(images, **kwargs)
+        return BaseImageProcessorFast().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def prepare_annotation(
         self,

tests/test_image_processing_common.py

@@ -311,8 +311,10 @@ class ImageProcessingTestMixin:
         }
         dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)}
         dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_slow_0, dict_slow_1)
@@ -324,8 +326,10 @@ class ImageProcessingTestMixin:
         }
         dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)}
         dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_fast_0, dict_fast_1)
@@ -357,8 +361,10 @@ class ImageProcessingTestMixin:
         }
         dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)}
         dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_slow_0, dict_slow_1)
@@ -370,8 +376,10 @@ class ImageProcessingTestMixin:
         }
         dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)}
         dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_fast_0, dict_fast_1)
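A condensed sketch of the round trip these assertions guard, written outside the mixin (the processor classes are illustrative; any slow/fast pair would do):

import tempfile

from transformers import DetrImageProcessor, DetrImageProcessorFast

with tempfile.TemporaryDirectory() as tmp:
    DetrImageProcessor().save_pretrained(tmp)
    fast = DetrImageProcessorFast.from_pretrained(tmp)
    dict_slow = DetrImageProcessor().to_dict()
    dict_fast = fast.to_dict()
    extra = {key: dict_fast[key] for key in set(dict_fast) - set(dict_slow)}
    # Fast-only keys must be None, except the two attributes only fast processors set.
    assert all(value is None for key, value in extra.items() if key not in ["default_to_square", "data_format"])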

utils/modular_model_converter.py

@@ -1087,8 +1087,7 @@ TYPE_TO_FILE_TYPE = {
     "Processor": "processing",
     "ImageProcessor": "image_processing",
     "ImageProcessorFast": "image_processing*_fast",  # "*" indicates where to insert the model name before the "_fast" suffix
-    "FastImageProcessorInitKwargs": "image_processing*_fast",
-    "FastImageProcessorPreprocessKwargs": "image_processing*_fast",
+    "FastImageProcessorKwargs": "image_processing*_fast",
     "FeatureExtractor": "feature_extractor",
     "ProcessorKwargs": "processing",
     "ImagesKwargs": "processing",