Add fixed resize and pad strategy for object detection (#30742)

* Add resize and pad strategy

* Merge get_size functions

* Add pad_size + tests to object detection models

* Fixup

* Update docstrings

* Fixup
Pavel Iakubovskii 2024-05-17 16:21:26 +01:00 committed by GitHub
parent e9a8041d1c
commit bf646fbf2d
13 changed files with 929 additions and 89 deletions
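
The change set below touches the shared size-dict validation plus five DETR-family image processors, adding a `{"max_height": int, "max_width": int}` resize strategy and a fixed `pad_size`. A minimal usage sketch of the two options together (the 640x640 values are arbitrary illustration choices, not defaults introduced by this commit):

import numpy as np
from transformers import DetrImageProcessor

processor = DetrImageProcessor(
    size={"max_height": 640, "max_width": 640},  # fit within 640x640, aspect ratio preserved
    pad_size={"height": 640, "width": 640},      # then zero-pad every image to exactly 640x640
)
images = [
    np.zeros((480, 360, 3), dtype=np.uint8),
    np.zeros((400, 640, 3), dtype=np.uint8),
]
inputs = processor(images=images, return_tensors="np")
print(inputs["pixel_values"].shape)  # (2, 3, 640, 640) -- a fixed shape for any batch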


@ -662,7 +662,13 @@ class BaseImageProcessor(ImageProcessingMixin):
)
VALID_SIZE_DICT_KEYS = ({"height", "width"}, {"shortest_edge"}, {"shortest_edge", "longest_edge"}, {"longest_edge"})
VALID_SIZE_DICT_KEYS = (
{"height", "width"},
{"shortest_edge"},
{"shortest_edge", "longest_edge"},
{"longest_edge"},
{"max_height", "max_width"},
)
def is_valid_size_dict(size_dict):
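
For context, a hypothetical re-implementation of the check that `VALID_SIZE_DICT_KEYS` feeds (a sketch, not the library's exact code): a size dict is valid when its key set equals one of the allowed combinations, now including the new `{"max_height", "max_width"}` pair.

def is_valid_size_dict_sketch(size_dict) -> bool:
    # List membership compares the key sets by equality, so exactly the
    # combinations enumerated above are accepted.
    return isinstance(size_dict, dict) and set(size_dict) in [
        {"height", "width"},
        {"shortest_edge"},
        {"shortest_edge", "longest_edge"},
        {"longest_edge"},
        {"max_height", "max_width"},
    ]

assert is_valid_size_dict_sketch({"max_height": 640, "max_width": 640})
assert not is_valid_size_dict_sketch({"max_height": 640})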


@ -147,6 +147,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one of the edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
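
The two documented examples can be verified with a standalone re-implementation of the same arithmetic (no imports needed):

def fit_within(height, width, max_height, max_width):
    # Scale by the tighter constraint so one edge lands exactly on its limit.
    scale = min(max_height / height, max_width / width)
    return int(height * scale), int(width * scale)

print(fit_within(100, 200, 50, 50))    # (25, 50): downscaled to fit
print(fit_within(100, 200, 200, 500))  # (200, 400): upscaled until one edge hits a limit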
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
@ -768,8 +804,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@ -793,8 +837,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
model_input_names = ["pixel_values", "pixel_mask"]
@ -813,6 +862,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@ -846,6 +896,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
@ -861,6 +912,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
"image_mean",
"image_std",
"do_pad",
"pad_size",
"format",
"return_tensors",
"data_format",
@ -933,8 +985,15 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
@ -953,18 +1012,27 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
new_size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
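
To illustrate the dispatch above, a sketch with made-up sizes; each commented shape follows from the branch the size dict selects:

import numpy as np
from transformers import ConditionalDetrImageProcessor

processor = ConditionalDetrImageProcessor()
image = np.zeros((300, 500, 3), dtype=np.uint8)  # (height, width, channels)

print(processor.resize(image, size={"height": 256, "width": 256}).shape)                # (256, 256, 3), exact resize
print(processor.resize(image, size={"shortest_edge": 200, "longest_edge": 400}).shape)  # (200, 333, 3), aspect kept
print(processor.resize(image, size={"max_height": 150, "max_width": 150}).shape)        # (90, 150, 3), new branch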
@ -1108,6 +1176,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images with zeros to the bottom and right, either to the specified `pad_size` or to the largest height and width
@ -1137,8 +1206,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@ -1146,7 +1223,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@ -1160,7 +1237,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@ -1195,6 +1272,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@ -1222,7 +1300,15 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@ -1240,8 +1326,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@ -1257,6 +1344,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
@ -1286,6 +1377,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
images = make_list_of_images(images)
@ -1410,6 +1502,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [


@ -145,6 +145,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one of the edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
@ -766,8 +802,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@ -791,8 +835,13 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
model_input_names = ["pixel_values", "pixel_mask"]
@ -811,6 +860,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@ -844,6 +894,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
@ -859,6 +910,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
"image_mean",
"image_std",
"do_pad",
"pad_size",
"format",
"return_tensors",
"data_format",
@ -931,8 +983,15 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
@ -951,18 +1010,27 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
new_size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
@ -1106,6 +1174,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images with zeros to the bottom and right, either to the specified `pad_size` or to the largest height and width
@ -1135,8 +1204,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@ -1144,7 +1221,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@ -1158,7 +1235,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@ -1193,6 +1270,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@ -1220,7 +1298,15 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@ -1238,8 +1324,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@ -1255,6 +1342,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
@ -1284,6 +1375,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
images = make_list_of_images(images)
@ -1408,6 +1500,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [


@ -139,6 +139,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one of the edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
@ -475,8 +511,16 @@ class DetaImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@ -500,8 +544,13 @@ class DetaImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
model_input_names = ["pixel_values", "pixel_mask"]
@ -519,6 +568,7 @@ class DetaImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: bool = True,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@ -542,6 +592,7 @@ class DetaImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA
def prepare_annotation(
@ -593,7 +644,15 @@ class DetaImageProcessor(BaseImageProcessor):
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
The desired output size. Can contain keys `shortest_edge` and `longest_edge` or `height` and `width`.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`ChannelDimension`, *optional*):
@ -605,18 +664,22 @@ class DetaImageProcessor(BaseImageProcessor):
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
new_size = (size["height"], size["width"])
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format
image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format
)
return image
@ -760,6 +823,7 @@ class DetaImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images with zeros to the bottom and right, either to the specified `pad_size` or to the largest height and width
@ -789,8 +853,16 @@ class DetaImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@ -798,7 +870,7 @@ class DetaImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@ -812,7 +884,7 @@ class DetaImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@ -846,6 +918,7 @@ class DetaImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@ -873,7 +946,15 @@ class DetaImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@ -891,8 +972,9 @@ class DetaImageProcessor(BaseImageProcessor):
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@ -908,6 +990,10 @@ class DetaImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
@ -929,6 +1015,7 @@ class DetaImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
# Here, the pad() method pads to the maximum (height, width) in the batch, or to pad_size if provided, so size does not need to be validated.
@ -1051,6 +1138,7 @@ class DetaImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
return_tensors=return_tensors,
update_bboxes=do_convert_annotations,
pad_size=pad_size,
)
else:
images = [


@ -116,6 +116,41 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
return (oh, ow)
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one of the edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
def get_resize_output_image_size(
input_image: np.ndarray,
size: Union[int, Tuple[int, int], List[int]],
@ -753,7 +788,15 @@ class DetrImageProcessor(BaseImageProcessor):
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method.
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@ -777,8 +820,13 @@ class DetrImageProcessor(BaseImageProcessor):
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
model_input_names = ["pixel_values", "pixel_mask"]
@ -796,6 +844,7 @@ class DetrImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@ -829,6 +878,7 @@ class DetrImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
@ -844,6 +894,7 @@ class DetrImageProcessor(BaseImageProcessor):
"image_mean",
"image_std",
"do_pad",
"pad_size",
"format",
"return_tensors",
"data_format",
@ -913,8 +964,15 @@ class DetrImageProcessor(BaseImageProcessor):
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
@ -933,18 +991,27 @@ class DetrImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
new_size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
@ -1083,6 +1150,7 @@ class DetrImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images with zeros to the bottom and right, either to the specified `pad_size` or to the largest height and width
@ -1112,8 +1180,16 @@ class DetrImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@ -1121,7 +1197,7 @@ class DetrImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@ -1135,7 +1211,7 @@ class DetrImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@ -1169,6 +1245,7 @@ class DetrImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@ -1196,7 +1273,15 @@ class DetrImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@ -1214,8 +1299,9 @@ class DetrImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@ -1231,6 +1317,10 @@ class DetrImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
@ -1260,6 +1350,7 @@ class DetrImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
images = make_list_of_images(images)
@ -1384,6 +1475,7 @@ class DetrImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [


@ -152,6 +152,42 @@ def get_resize_output_image_size(
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one of the edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
@ -773,8 +809,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@ -798,8 +842,14 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
model_input_names = ["pixel_values", "pixel_mask"]
@ -818,6 +868,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@ -851,6 +902,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
@ -866,6 +918,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
"image_mean",
"image_std",
"do_pad",
"pad_size",
"format",
"return_tensors",
"data_format",
@ -938,8 +991,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
@ -958,18 +1018,27 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
new_size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
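The three branches reduce to the following size arithmetic (a standalone sketch with a hypothetical `pick_output_size` helper; it uses plain `int` truncation, where the library helpers may round slightly differently):

def pick_output_size(height, width, size):
    # Mirrors the dispatch in `resize` above: each key combination selects a strategy.
    if "shortest_edge" in size and "longest_edge" in size:
        scale = size["shortest_edge"] / min(height, width)
        if max(height, width) * scale > size["longest_edge"]:
            scale = size["longest_edge"] / max(height, width)
        return int(height * scale), int(width * scale)
    if "max_height" in size and "max_width" in size:
        scale = min(size["max_height"] / height, size["max_width"] / width)
        return int(height * scale), int(width * scale)
    if "height" in size and "width" in size:
        return size["height"], size["width"]
    raise ValueError(f"Unsupported size keys: {size.keys()}")

pick_output_size(480, 640, {"shortest_edge": 960, "longest_edge": 1333})  # (960, 1280)
pick_output_size(480, 640, {"max_height": 240, "max_width": 240})         # (180, 240)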
@ -1113,6 +1182,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of the largest height and width
@ -1142,8 +1212,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@ -1151,7 +1229,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@ -1165,7 +1243,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@ -1200,6 +1278,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@ -1227,7 +1306,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@ -1245,8 +1332,9 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@ -1262,6 +1350,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
@ -1291,6 +1383,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
images = make_list_of_images(images)
@ -1415,6 +1508,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [

View File

@ -133,6 +133,42 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
return (height, width)
# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
input_image: np.ndarray,
max_height: int,
max_width: int,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width. Keeps the
aspect ratio. Important: even if image_height < max_height and image_width < max_width, the image will be
resized so that at least one of its edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
input_image (`np.ndarray`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
"""
image_size = get_image_size(input_image, input_data_format)
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
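For reference, the two examples in the docstring can be checked directly (a quick sketch, assuming the function above is in scope and given a channels-last uint8 array, for which `get_image_size` infers `(height, width)`):

import numpy as np

image = np.zeros((100, 200, 3), dtype=np.uint8)  # height=100, width=200
get_image_size_for_max_height_width(image, max_height=50, max_width=50)    # (25, 50)
get_image_size_for_max_height_width(image, max_height=200, max_width=500)  # (200, 400)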
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
input_image: np.ndarray,
@ -678,8 +714,16 @@ class YolosImageProcessor(BaseImageProcessor):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
in the `preprocess` method. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
@ -699,8 +743,13 @@ class YolosImageProcessor(BaseImageProcessor):
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True` will pad the images in the batch to the largest height and width in the batch.
Padding will be applied to the bottom and right of the image with zeros.
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
model_input_names = ["pixel_values", "pixel_mask"]
@ -718,6 +767,7 @@ class YolosImageProcessor(BaseImageProcessor):
image_std: Union[float, List[float]] = None,
do_convert_annotations: Optional[bool] = None,
do_pad: bool = True,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
@ -751,6 +801,7 @@ class YolosImageProcessor(BaseImageProcessor):
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
self.pad_size = pad_size
self._valid_processor_keys = [
"images",
"annotations",
@ -766,6 +817,7 @@ class YolosImageProcessor(BaseImageProcessor):
"image_std",
"do_convert_annotations",
"do_pad",
"pad_size",
"format",
"return_tensors",
"data_format",
@ -838,8 +890,15 @@ class YolosImageProcessor(BaseImageProcessor):
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
`height` and `width`.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
@ -858,18 +917,27 @@ class YolosImageProcessor(BaseImageProcessor):
max_size = None
size = get_size_dict(size, max_size=max_size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(
new_size = get_resize_output_image_size(
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
)
elif "max_height" in size and "max_width" in size:
new_size = get_image_size_for_max_height_width(
image, size["max_height"], size["max_width"], input_data_format=input_data_format
)
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
new_size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
image,
size=new_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
return image
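End to end, the new strategy behaves as in the tests further down (a minimal sketch; the sizes mirror the test values):

from transformers import YolosImageProcessor
import numpy as np

image = np.zeros((200, 100, 3), dtype=np.uint8)
processor = YolosImageProcessor(
    size={"max_height": 100, "max_width": 100},
    do_pad=True,
    pad_size={"height": 100, "width": 100},
)
batch = processor(images=image, return_tensors="pt")
# Resized to 100x50 (aspect ratio kept), then padded bottom-right to 100x100:
# batch["pixel_values"].shape == (1, 3, 100, 100)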
@ -1012,6 +1080,7 @@ class YolosImageProcessor(BaseImageProcessor):
data_format: Optional[ChannelDimension] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
update_bboxes: bool = True,
pad_size: Optional[Dict[str, int]] = None,
) -> BatchFeature:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of the largest height and width
@ -1042,8 +1111,16 @@ class YolosImageProcessor(BaseImageProcessor):
Whether to update the bounding boxes in the annotations to match the padded images. If the
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
format, the bounding boxes will not be updated.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
pad_size = get_max_height_width(images, input_data_format=input_data_format)
pad_size = pad_size if pad_size is not None else self.pad_size
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images, input_data_format=input_data_format)
annotation_list = annotations if annotations is not None else [None] * len(images)
padded_images = []
@ -1051,7 +1128,7 @@ class YolosImageProcessor(BaseImageProcessor):
for image, annotation in zip(images, annotation_list):
padded_image, padded_annotation = self._pad_image(
image,
pad_size,
padded_size,
annotation,
constant_values=constant_values,
data_format=data_format,
@ -1065,7 +1142,7 @@ class YolosImageProcessor(BaseImageProcessor):
if return_pixel_mask:
masks = [
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
for image in images
]
data["pixel_mask"] = masks
@ -1099,6 +1176,7 @@ class YolosImageProcessor(BaseImageProcessor):
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
pad_size: Optional[Dict[str, int]] = None,
**kwargs,
) -> BatchFeature:
"""
@ -1126,7 +1204,15 @@ class YolosImageProcessor(BaseImageProcessor):
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@ -1144,8 +1230,9 @@ class YolosImageProcessor(BaseImageProcessor):
boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
and in relative coordinates.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
Whether to pad the image. If `True`, padding will be applied to the bottom and right of
the image with zeros. If `pad_size` is provided, the image will be padded to the specified
dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@ -1158,6 +1245,10 @@ class YolosImageProcessor(BaseImageProcessor):
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
"""
if "pad_and_return_pixel_mask" in kwargs:
logger.warning_once(
@ -1187,6 +1278,7 @@ class YolosImageProcessor(BaseImageProcessor):
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
)
do_pad = self.do_pad if do_pad is None else do_pad
pad_size = self.pad_size if pad_size is None else pad_size
format = self.format if format is None else format
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
@ -1310,6 +1402,7 @@ class YolosImageProcessor(BaseImageProcessor):
input_data_format=input_data_format,
update_bboxes=do_convert_annotations,
return_tensors=return_tensors,
pad_size=pad_size,
)
else:
images = [

View File

@ -490,3 +490,50 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> resized to 200x100, padded to 301x101
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = ConditionalDetrImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

View File

@ -492,3 +492,50 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DeformableDetrImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DeformableDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DeformableDetrImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> resized to 200x100, padded to 301x101
image_processor = DeformableDetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DeformableDetrImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

View File

@ -486,3 +486,50 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DetaImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DetaImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DetaImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> resized to 200x100, padded to 301x101
image_processor = DetaImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DetaImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

View File

@ -547,3 +547,49 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = DetrImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = DetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = DetrImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> resized to 200x100, padded to 301x101
image_processor = DetrImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = DetrImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
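For the batch case, each image is first resized under `max_height=150, max_width=100`: 200x100 scales by min(150/200, 100/100) = 0.75 to 150x75, and 100x150 scales by min(150/100, 100/150) = 2/3 to roughly 66x100 (integer truncation may drop a pixel). Both are then padded bottom-right to the fixed `pad_size` of 150x100, which is why the batch shape is `(2, 3, 150, 100)`.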

View File

@ -528,3 +528,50 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = GroundingDinoImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = GroundingDinoImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = GroundingDinoImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> resized to 200x100, padded to 301x101
image_processor = GroundingDinoImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = GroundingDinoImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

View File

@ -546,3 +546,50 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
).T
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
# Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
def test_max_width_max_height_resizing_and_pad_strategy(self):
image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
# do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
image_processor = YolosImageProcessor(
size={"max_height": 100, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
# do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
image_processor = YolosImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=False,
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
# do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
image_processor = YolosImageProcessor(
size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
# do_pad=True, max_height=300, max_width=100, image=200x100 -> resized to 200x100, padded to 301x101
image_processor = YolosImageProcessor(
size={"max_height": 300, "max_width": 100},
do_pad=True,
pad_size={"height": 301, "width": 101},
)
inputs = image_processor(images=[image_1], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
### Check for batch
image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
# do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
image_processor = YolosImageProcessor(
size={"max_height": 150, "max_width": 100},
do_pad=True,
pad_size={"height": 150, "width": 100},
)
inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))