mirror of https://github.com/huggingface/transformers.git
Remove differences between init and preprocess kwargs for fast image processors (#36186)
* Remove differences between init and preprocess kwargs in fast image processors
* make modifs got_ocr2
* update gemma3
parent cc3a361b46
commit ea219ed164
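From the caller's side, the change means each fast image processor now advertises a single kwargs TypedDict: whatever is accepted at construction time is also accepted per call. A usage sketch (checkpoint name and kwarg choice are illustrative, not part of this diff):

```python
from PIL import Image
from transformers import AutoImageProcessor

image = Image.new("RGB", (640, 480))

# Any key from the processor's single Kwargs TypedDict can be frozen in at init...
processor = AutoImageProcessor.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",  # illustrative checkpoint
    use_fast=True,
    do_pad=False,
)

# ...or overridden per call; keys left unset fall back to the instance defaults.
batch = processor(image, do_pad=True, return_tensors="pt")
print(batch["pixel_values"].shape)
```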
@@ -126,7 +126,7 @@ def divide_to_patches(
     return patches


-class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
+class DefaultFastImageProcessorKwargs(TypedDict, total=False):
     do_resize: Optional[bool]
     size: Optional[Dict[str, int]]
     default_to_square: Optional[bool]
@@ -139,9 +139,6 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
     image_mean: Optional[Union[float, List[float]]]
     image_std: Optional[Union[float, List[float]]]
     do_convert_rgb: Optional[bool]
-
-
-class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwargs):
     return_tensors: Optional[Union[str, TensorType]]
     data_format: Optional[ChannelDimension]
     input_data_format: Optional[Union[str, ChannelDimension]]
@@ -185,8 +182,20 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r"""
         Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
         number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
         Can be overridden by the `image_std` parameter in the `preprocess` method.
-    do_convert_rgb (`bool`, *optional*, defaults to `self.image_std`):
-        Whether to convert the image to RGB."""
+    do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+        Whether to convert the image to RGB.
+    return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
+        Returns stacked tensors if set to `pt, otherwise returns a list of tensors.
+    data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
+        Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
+    input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
+        The channel dimension format for the input image. If unset, the channel dimension format is inferred
+        from the input image. Can be one of:
+        - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+        - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+    device (`torch.device`, *optional*, defaults to `self.device`):
+        The device to process the images on. If unset, the device is inferred from the input images."""

 BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
     Preprocess an image or batch of images.
@@ -219,20 +228,17 @@ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
             `True`.
         do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
             Whether to convert the image to RGB.
-        return_tensors (`str` or `TensorType`, *optional*):
+        return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
             Returns stacked tensors if set to `pt, otherwise returns a list of tensors.
-        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
-            The channel dimension format for the output image. Can be one of:
-            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-            - Unset: Use the channel dimension format of the input image.
-        input_data_format (`ChannelDimension` or `str`, *optional*):
+        data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
+            Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
+        input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
             The channel dimension format for the input image. If unset, the channel dimension format is inferred
             from the input image. Can be one of:
             - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
             - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
             - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-        device (`torch.device`, *optional*):
+        device (`torch.device`, *optional*, defaults to `self.device`):
            The device to process the images on. If unset, the device is inferred from the input images."""
@@ -253,13 +259,16 @@ class BaseImageProcessorFast(BaseImageProcessor):
     rescale_factor = 1 / 255
     do_normalize = None
     do_convert_rgb = None
+    return_tensors = None
+    data_format = ChannelDimension.FIRST
+    input_data_format = None
+    device = None
     model_input_names = ["pixel_values"]
-    valid_init_kwargs = DefaultFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DefaultFastImageProcessorPreprocessKwargs
+    valid_kwargs = DefaultFastImageProcessorKwargs

     def __init__(
         self,
-        **kwargs: Unpack[DefaultFastImageProcessorInitKwargs],
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
     ) -> None:
         super().__init__(**kwargs)
         size = kwargs.pop("size", self.size)
@@ -270,7 +279,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         )
         crop_size = kwargs.pop("crop_size", self.crop_size)
         self.crop_size = get_size_dict(crop_size, param_name="crop_size") if crop_size is not None else None
-        for key in self.valid_init_kwargs.__annotations__.keys():
+        for key in self.valid_kwargs.__annotations__.keys():
            kwarg = kwargs.pop(key, None)
            if kwarg is not None:
                setattr(self, key, kwarg)
@@ -553,14 +562,12 @@ class BaseImageProcessorFast(BaseImageProcessor):
     def preprocess(
         self,
         images: ImageInput,
-        **kwargs: Unpack[DefaultFastImageProcessorPreprocessKwargs],
+        **kwargs: Unpack[DefaultFastImageProcessorKwargs],
     ) -> BatchFeature:
-        validate_kwargs(
-            captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
-        )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
-        for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
+        for kwarg_name in self.valid_kwargs.__annotations__:
             kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))

         # Extract parameters that are only used for preparing the input images
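The base-class hunks above reduce the machinery to a single source of truth: one `valid_kwargs` TypedDict types both `__init__` and `preprocess`, validates incoming keys, and backfills unset call-time kwargs from instance attributes. A self-contained sketch of that mechanism under simplified names (not the library's actual classes):

```python
from typing import Any, Dict, Optional

from typing_extensions import TypedDict, Unpack


class DefaultKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[Dict[str, int]]


class FastProcessorSketch:
    # One TypedDict drives init typing, preprocess typing, and validation.
    valid_kwargs = DefaultKwargs
    do_resize = None
    size = None

    def __init__(self, **kwargs: Unpack[DefaultKwargs]) -> None:
        # Keys passed at init become instance-level defaults.
        for key in self.valid_kwargs.__annotations__:
            value = kwargs.pop(key, None)
            if value is not None:
                setattr(self, key, value)

    def preprocess(self, images: Any, **kwargs: Unpack[DefaultKwargs]) -> Dict[str, Any]:
        # Reject unknown keys, then fill missing ones from the instance.
        unknown = set(kwargs) - set(self.valid_kwargs.__annotations__)
        if unknown:
            raise ValueError(f"Unrecognized kwargs: {sorted(unknown)}")
        for key in self.valid_kwargs.__annotations__:
            kwargs.setdefault(key, getattr(self, key, None))
        return dict(kwargs)


sketch = FastProcessorSketch(do_resize=True)
print(sketch.preprocess(images=[], size={"height": 224, "width": 224}))
```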
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -54,11 +53,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class ConvNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    crop_pct: Optional[float]
-
-
-class ConvNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     crop_pct: Optional[float]


@@ -81,10 +76,9 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     crop_pct = 224 / 256
-    valid_init_kwargs = ConvNextFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = ConvNextFastImageProcessorPreprocessKwargs
+    valid_kwargs = ConvNextFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -95,9 +89,7 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast):
             overridden by `crop_pct` in the`preprocess` method.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def resize(
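The ConvNext hunks show the per-model pattern: the former InitKwargs/PreprocessKwargs pair collapses into one subclass that declares the model-specific key (`crop_pct`) exactly once, and that key is then valid in both `__init__` and `preprocess`. A minimal sketch with illustrative names:

```python
from typing import Optional

from typing_extensions import TypedDict, Unpack


class DefaultKwargs(TypedDict, total=False):
    do_resize: Optional[bool]


# One subclass now serves both __init__ and preprocess.
class ConvNextKwargsSketch(DefaultKwargs, total=False):
    crop_pct: Optional[float]


def preprocess(**kwargs: Unpack[ConvNextKwargsSketch]) -> dict:
    # `crop_pct` is a valid key here and at construction time alike.
    return dict(kwargs)


print(preprocess(do_resize=True, crop_pct=224 / 256))
```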
@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     get_image_size_for_max_height_width,
     get_max_height_width,
@@ -58,21 +57,12 @@ elif is_torchvision_available():
 logger = logging.get_logger(__name__)


-class DeformableDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class DeformableDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]


 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@@ -294,6 +284,8 @@ def prepare_coco_panoptic_annotation(
            The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
            height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
     """,
 )
 class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
@@ -308,10 +300,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
     size = {"shortest_edge": 800, "longest_edge": 1333}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = DeformableDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DeformableDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = DeformableDetrFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")

@@ -605,7 +596,11 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
     """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[DeformableDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
@@ -621,7 +616,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
             )
             kwargs["size"] = kwargs.pop("max_size")

-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def _preprocess(
         self,
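For the detection processors, `annotations` and `masks_path` stop being TypedDict kwargs and become explicit `preprocess` parameters: they are per-call inputs rather than configuration, so an init-time default makes no sense for them. A sketch of the resulting call shape (checkpoint and annotation contents are illustrative; the dict follows the COCO detection format):

```python
from PIL import Image
from transformers import AutoImageProcessor

image = Image.new("RGB", (640, 480))
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)

annotations = {
    "image_id": 0,
    "annotations": [
        {"bbox": [10, 10, 50, 80], "category_id": 1, "area": 4000, "iscrowd": 0},
    ],
}

outputs = processor.preprocess(
    image,
    annotations=annotations,  # per-call data: now an explicit parameter
    do_pad=True,              # configuration: still a TypedDict kwarg
    return_tensors="pt",
)
print(outputs["pixel_values"].shape)
```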
@@ -24,8 +24,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     get_image_size_for_max_height_width,
     get_max_height_width,
@@ -283,21 +282,12 @@ def prepare_coco_panoptic_annotation(
     return new_target


-class DetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]


 @add_start_docstrings(
@@ -319,6 +309,8 @@ class DetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocess
            The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
            height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
     """,
 )
 class DetrImageProcessorFast(BaseImageProcessorFast):
@@ -333,10 +325,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
     size = {"shortest_edge": 800, "longest_edge": 1333}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = DetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = DetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = DetrFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[DetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")

@@ -629,7 +620,13 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
             Path to the directory containing the segmentation masks.
     """,
     )
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[DetrFastImageProcessorPreprocessKwargs]) -> BatchFeature:
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[DetrFastImageProcessorKwargs],
+    ) -> BatchFeature:
         if "pad_and_return_pixel_mask" in kwargs:
             kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
             logger.warning_once(
@@ -644,7 +641,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
             )
             kwargs["size"] = kwargs.pop("max_size")

-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def _preprocess(
         self,
@@ -24,8 +24,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
     BatchFeature,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     get_size_dict,
     group_images_by_shape,
     reorder_images,
@@ -67,14 +66,7 @@ if is_torchvision_available():
 logger = logging.get_logger(__name__)


-class Gemma3FastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    do_pan_and_scan: Optional[bool]
-    pan_and_scan_min_crop_size: Optional[int]
-    pan_and_scan_max_num_crops: Optional[int]
-    pan_and_scan_min_ratio_to_activate: Optional[float]
-
-
-class Gemma3FastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     do_pan_and_scan: Optional[bool]
     pan_and_scan_min_crop_size: Optional[int]
     pan_and_scan_max_num_crops: Optional[int]
@@ -108,10 +100,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
     pan_and_scan_min_crop_size = None
     pan_and_scan_max_num_crops = None
     pan_and_scan_min_ratio_to_activate = None
-    valid_init_kwargs = Gemma3FastImageProcessorInitKwargs
-    valid_preprocess_kwargs = Gemma3FastImageProcessorPreprocessKwargs
+    valid_kwargs = Gemma3FastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]):
         super().__init__(**kwargs)

     def _prepare_images_structure(
@@ -262,14 +253,12 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast):
     def preprocess(
         self,
         images: ImageInput,
-        **kwargs: Unpack[Gemma3FastImageProcessorPreprocessKwargs],
+        **kwargs: Unpack[Gemma3FastImageProcessorKwargs],
     ) -> BatchFeature:
-        validate_kwargs(
-            captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
-        )
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
         # Set default kwargs from self. This ensures that if a kwarg is not provided
         # by the user, it gets its default value from the instance, or is set to None.
-        for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
+        for kwarg_name in self.valid_kwargs.__annotations__:
             kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))

         # Extract parameters that are only used for preparing the input images
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -54,13 +53,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class GotOcr2ImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
-
-
-class GotOcr2ImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class GotOcr2ImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     crop_to_patches: Optional[bool]
     min_patches: Optional[int]
     max_patches: Optional[int]
@@ -93,10 +86,9 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
     crop_to_patches = False
     min_patches = 1
     max_patches = 12
-    valid_init_kwargs = GotOcr2ImageProcessorInitKwargs
-    valid_preprocess_kwargs = GotOcr2ImageProcessorPreprocessKwargs
+    valid_kwargs = GotOcr2ImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[valid_kwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -113,7 +105,7 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
             set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
         """,
     )
-    def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorPreprocessKwargs]) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[valid_kwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def crop_image_to_patches(
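One quirk worth noting in the GOT-OCR2 hunks: the new annotations read `Unpack[valid_kwargs]` rather than naming the TypedDict directly. That works because `valid_kwargs` is already bound in the class body by the time the methods are defined, so the annotation evaluates at definition time. A standalone sketch (simplified names, not the library's):

```python
from typing_extensions import TypedDict, Unpack


class Kwargs(TypedDict, total=False):
    crop_to_patches: bool


class Processor:
    valid_kwargs = Kwargs  # bound before the methods below are defined

    # Class-body names are in scope while the class body executes,
    # so this annotation resolves without error at definition time.
    def __init__(self, **kwargs: Unpack[valid_kwargs]) -> None:
        self.options = dict(kwargs)


print(Processor(crop_to_patches=True).options)
```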
@@ -23,8 +23,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -61,11 +60,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class LlavaFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    do_pad: Optional[bool]
-
-
-class LlavaFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     do_pad: Optional[bool]


@@ -90,10 +85,9 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     do_convert_rgb = True
-    valid_init_kwargs = LlavaFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None:
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -103,9 +97,7 @@ class LlavaImageProcessorFast(BaseImageProcessorFast):
             Whether to pad the image to a square based on the longest edge. Can be overridden by the `do_pad` parameter
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def pad_to_square(
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     divide_to_patches,
     group_images_by_shape,
     reorder_images,
@@ -57,12 +56,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class LlavaNextFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    image_grid_pinpoints: Optional[List[List[int]]]
-    do_pad: Optional[bool]
-
-
-class LlavaNextFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     image_grid_pinpoints: Optional[List[List[int]]]
     do_pad: Optional[bool]

@@ -96,10 +90,9 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
     do_convert_rgb = True
     do_pad = True
     image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
-    valid_init_kwargs = LlavaNextFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaNextFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaNextFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -113,9 +106,7 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast):
             number of patches in the batch. Padding will be applied to the bottom and right with zeros.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def _prepare_images_structure(
@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     divide_to_patches,
     group_images_by_shape,
     reorder_images,
@@ -40,12 +39,7 @@ else:
     from torchvision.transforms import functional as F


-class LlavaOnevisionFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    image_grid_pinpoints: Optional[List[List[int]]]
-    do_pad: Optional[bool]
-
-
-class LlavaOnevisionFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     image_grid_pinpoints: Optional[List[List[int]]]
     do_pad: Optional[bool]

@@ -77,11 +71,10 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
     do_convert_rgb = True
     do_pad = True
     image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]]  # fmt: skip
-    valid_init_kwargs = LlavaOnevisionFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = LlavaOnevisionFastImageProcessorPreprocessKwargs
+    valid_kwargs = LlavaOnevisionFastImageProcessorKwargs
     model_input_names = ["pixel_values_videos"]

-    def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -95,9 +88,7 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast):
             number of patches in the batch. Padding will be applied to the bottom and right with zeros.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def _prepare_images_structure(
@@ -21,8 +21,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -61,11 +60,7 @@ if is_torchvision_available():
     from torchvision.transforms import functional as F


-class PixtralFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
-    patch_size: Optional[Dict[str, int]]
-
-
-class PixtralFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
+class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     patch_size: Optional[Dict[str, int]]


@@ -88,10 +83,9 @@ class PixtralImageProcessorFast(BaseImageProcessorFast):
     do_rescale = True
     do_normalize = True
     do_convert_rgb = True
-    valid_init_kwargs = PixtralFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = PixtralFastImageProcessorPreprocessKwargs
+    valid_kwargs = PixtralFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     @add_start_docstrings(
@@ -101,9 +95,7 @@ class PixtralImageProcessorFast(BaseImageProcessorFast):
             Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
         """,
     )
-    def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorPreprocessKwargs]
-    ) -> BatchFeature:
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorKwargs]) -> BatchFeature:
         return super().preprocess(images, **kwargs)

     def resize(
@@ -25,7 +25,7 @@ from ...image_processing_utils import BatchFeature
 from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
+    DefaultFastImageProcessorKwargs,
     group_images_by_shape,
     reorder_images,
 )
@@ -69,7 +69,7 @@ elif is_torchvision_available():
 logger = logging.get_logger(__name__)


-class Qwen2VLFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class Qwen2VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     min_pixels: Optional[int]
     max_pixels: Optional[int]
     patch_size: Optional[int]
@@ -107,10 +107,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
     merge_size = 2
     min_pixels = 56 * 56
     max_pixels = 28 * 28 * 1280
-    valid_init_kwargs = Qwen2VLFastImageProcessorInitKwargs
+    valid_kwargs = DefaultFastImageProcessorKwargs
     model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]

-    def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorInitKwargs]):
+    def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs]):
         super().__init__(**kwargs)

     def _preprocess(
@@ -12,8 +12,7 @@ from ...image_processing_utils_fast import (
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
     BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
     BaseImageProcessorFast,
-    DefaultFastImageProcessorInitKwargs,
-    DefaultFastImageProcessorPreprocessKwargs,
+    DefaultFastImageProcessorKwargs,
     SizeDict,
     add_start_docstrings,
     get_image_size_for_max_height_width,
@@ -53,21 +52,12 @@ elif is_torchvision_available():
 logger = logging.get_logger(__name__)


-class RTDetrFastImageProcessorInitKwargs(DefaultFastImageProcessorInitKwargs):
+class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
     format: Optional[Union[str, AnnotationFormat]]
     do_convert_annotations: Optional[bool]
     do_pad: Optional[bool]
     pad_size: Optional[Dict[str, int]]
-
-
-class RTDetrFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorPreprocessKwargs):
-    format: Optional[AnnotationFormat]
-    annotations: Optional[Dict]
-    do_convert_annotations: Optional[bool]
-    do_pad: Optional[bool]
-    pad_size: Optional[Dict[str, int]]
     return_segmentation_masks: Optional[bool]
-    masks_path: Optional[Union[str, pathlib.Path]]


 SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
@@ -151,6 +141,8 @@ def prepare_coco_detection_annotation(
            The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
            height and width in the batch.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to return segmentation masks.
     """,
 )
 class RTDetrImageProcessorFast(BaseImageProcessorFast):
@@ -165,11 +157,10 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
     size = {"height": 640, "width": 640}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = RTDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = RTDetrFastImageProcessorKwargs
     do_convert_annotations = True

-    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
         do_convert_annotations = kwargs.get("do_convert_annotations", None)
         do_normalize = kwargs.get("do_normalize", None)
@@ -424,9 +415,13 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast):
     """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[RTDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
-        return super().preprocess(images, **kwargs)
+        return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def _preprocess(
         self,
@@ -2,8 +2,7 @@ import pathlib
 from typing import Dict, List, Optional, Tuple, Union

 from transformers.models.detr.image_processing_detr_fast import (
-    DetrFastImageProcessorInitKwargs,
-    DetrFastImageProcessorPreprocessKwargs,
+    DetrFastImageProcessorKwargs,
     DetrImageProcessorFast,
 )

@@ -112,11 +111,7 @@ def prepare_coco_detection_annotation(
     return new_target


-class RTDetrFastImageProcessorInitKwargs(DetrFastImageProcessorInitKwargs):
-    pass
-
-
-class RTDetrFastImageProcessorPreprocessKwargs(DetrFastImageProcessorPreprocessKwargs):
+class RTDetrFastImageProcessorKwargs(DetrFastImageProcessorKwargs):
     pass


@@ -133,10 +128,9 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
     size = {"height": 640, "width": 640}
     default_to_square = False
     model_input_names = ["pixel_values", "pixel_mask"]
-    valid_init_kwargs = RTDetrFastImageProcessorInitKwargs
-    valid_preprocess_kwargs = RTDetrFastImageProcessorPreprocessKwargs
+    valid_kwargs = RTDetrFastImageProcessorKwargs

-    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorInitKwargs]) -> None:
+    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
         # Backwards compatibility
         do_convert_annotations = kwargs.get("do_convert_annotations", None)
         do_normalize = kwargs.get("do_normalize", None)
@@ -181,9 +175,13 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast, BaseImageProcessorFast):
     """,
     )
     def preprocess(
-        self, images: ImageInput, **kwargs: Unpack[RTDetrFastImageProcessorPreprocessKwargs]
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[RTDetrFastImageProcessorKwargs],
     ) -> BatchFeature:
-        return BaseImageProcessorFast().preprocess(images, **kwargs)
+        return BaseImageProcessorFast().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)

     def prepare_annotation(
         self,
@@ -311,8 +311,10 @@ class ImageProcessingTestMixin:
         }
         dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)}
         dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_slow_0, dict_slow_1)

@@ -324,8 +326,10 @@ class ImageProcessingTestMixin:
         }
         dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)}
         dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_fast_0, dict_fast_1)

@@ -357,8 +361,10 @@ class ImageProcessingTestMixin:
         }
         dict_slow_0 = {key: dict_slow_0[key] for key in set(dict_slow_0) & set(dict_slow_1)}
         dict_slow_1 = {key: dict_slow_1[key] for key in set(dict_slow_0) & set(dict_slow_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_slow_0, dict_slow_1)

@@ -370,8 +376,10 @@ class ImageProcessingTestMixin:
         }
         dict_fast_0 = {key: dict_fast_0[key] for key in set(dict_fast_0) & set(dict_fast_1)}
         dict_fast_1 = {key: dict_fast_1[key] for key in set(dict_fast_0) & set(dict_fast_1)}
-        # check that all additional keys are None, except for `default_to_square` which is only set in fast processors
-        self.assertTrue(all(value is None for key, value in difference.items() if key not in ["default_to_square"]))
+        # check that all additional keys are None, except for `default_to_square` and `data_format` which are only set in fast processors
+        self.assertTrue(
+            all(value is None for key, value in difference.items() if key not in ["default_to_square", "data_format"])
+        )
         # check that the remaining keys are the same
         self.assertEqual(dict_fast_0, dict_fast_1)
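The test hunks widen an allow-list: keys that exist only on fast processors may legitimately differ when comparing slow and fast processor dumps, and `data_format` now joins `default_to_square` there. The check itself is a symmetric difference over the two config dicts; a standalone sketch of the logic (dict contents are made up):

```python
dict_slow = {"do_resize": True, "size": {"shortest_edge": 224}, "do_center_crop": None}
dict_fast = {
    "do_resize": True,
    "size": {"shortest_edge": 224},
    "default_to_square": False,
    "data_format": "channels_first",
}

# Keys present in exactly one of the two dicts.
difference = {
    key: dict_slow.get(key, dict_fast.get(key)) for key in set(dict_slow) ^ set(dict_fast)
}

# Every one-sided key must be None, except the fast-only config attributes.
assert all(
    value is None
    for key, value in difference.items()
    if key not in ["default_to_square", "data_format"]
)
print(sorted(difference))
```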
@@ -1087,8 +1087,7 @@ TYPE_TO_FILE_TYPE = {
     "Processor": "processing",
     "ImageProcessor": "image_processing",
     "ImageProcessorFast": "image_processing*_fast",  # "*" indicates where to insert the model name before the "_fast" suffix
-    "FastImageProcessorInitKwargs": "image_processing*_fast",
-    "FastImageProcessorPreprocessKwargs": "image_processing*_fast",
+    "FastImageProcessorKwargs": "image_processing*_fast",
     "FeatureExtractor": "feature_extractor",
     "ProcessorKwargs": "processing",
     "ImagesKwargs": "processing",