Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-31 02:02:21 +06:00)

commit bf646fbf2d (parent e9a8041d1c)

Add fixed resize and pad strategy for object detection (#30742)

* Add resize and pad strategy
* Merge get_size functions
* Add pad_size + tests to object detection models
* Fixup
* Update docstrings
* Fixup
src/transformers/image_processing_utils.py

@@ -662,7 +662,13 @@ class BaseImageProcessor(ImageProcessingMixin):
         )


-VALID_SIZE_DICT_KEYS = ({"height", "width"}, {"shortest_edge"}, {"shortest_edge", "longest_edge"}, {"longest_edge"})
+VALID_SIZE_DICT_KEYS = (
+    {"height", "width"},
+    {"shortest_edge"},
+    {"shortest_edge", "longest_edge"},
+    {"longest_edge"},
+    {"max_height", "max_width"},
+)


 def is_valid_size_dict(size_dict):
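To make the new tuple entry concrete, here is a standalone sketch of the validation (an approximation for illustration, not the library's exact implementation) showing that `max_height`/`max_width` dictionaries now pass:

VALID_SIZE_DICT_KEYS = (
    {"height", "width"},
    {"shortest_edge"},
    {"shortest_edge", "longest_edge"},
    {"longest_edge"},
    {"max_height", "max_width"},
)

def is_valid_size_dict(size_dict):
    # Valid if the dict's key set matches one of the allowed combinations.
    return isinstance(size_dict, dict) and set(size_dict) in VALID_SIZE_DICT_KEYS

assert is_valid_size_dict({"max_height": 480, "max_width": 640})  # newly accepted
assert not is_valid_size_dict({"max_height": 480})                # still rejected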
src/transformers/models/conditional_detr/image_processing_conditional_detr.py

@@ -147,6 +147,42 @@ def get_resize_output_image_size(
     return get_size_with_aspect_ratio(image_size, size, max_size)


+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
+    ratio. Important: even if image_height < max_height and image_width < max_width, the image will still be resized
+    so that at least one of the edges equals max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+
+
 # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
 def get_numpy_to_framework_fn(arr) -> Callable:
     """
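The rule is simply "apply the smaller of the two scale factors". A standalone re-derivation that reproduces the docstring's two examples (an illustrative helper, not the library function itself):

def scaled_size(height: int, width: int, max_height: int, max_width: int) -> tuple:
    # Use the smaller scale factor so both edges stay within their bounds.
    min_scale = min(max_height / height, max_width / width)
    return int(height * min_scale), int(width * min_scale)

print(scaled_size(100, 200, 50, 50))    # (25, 50): width is the binding edge, scale 0.25
print(scaled_size(100, 200, 200, 500))  # (200, 400): upscaled until height hits max_height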
@@ -768,8 +804,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
             overridden by the `do_resize` parameter in the `preprocess` method.
         size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
-            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
-            the `preprocess` method.
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                  Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                  the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge` and the longest
+                  edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                  aspect ratio and keeping the height less than or equal to `max_height` and the width less than or
+                  equal to `max_width`.
         resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
             Resampling filter to use if resizing the image.
         do_rescale (`bool`, *optional*, defaults to `True`):

@@ -793,8 +837,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
         do_pad (`bool`, *optional*, defaults to `True`):
             Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            method. If `True` will pad the images in the batch to the largest height and width in the batch.
-            Padding will be applied to the bottom and right of the image with zeros.
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`Dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
     """

     model_input_names = ["pixel_values", "pixel_mask"]

@@ -813,6 +862,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         image_std: Union[float, List[float]] = None,
         do_convert_annotations: Optional[bool] = None,
         do_pad: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> None:
         if "pad_and_return_pixel_mask" in kwargs:

@@ -846,6 +896,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
+        self.pad_size = pad_size
         self._valid_processor_keys = [
             "images",
             "annotations",

@@ -861,6 +912,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             "image_mean",
             "image_std",
             "do_pad",
+            "pad_size",
             "format",
             "return_tensors",
             "data_format",

@@ -933,8 +985,15 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
-                `height` and `width`.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge`
+                      and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting
+                      the aspect ratio and keeping the height less than or equal to `max_height` and the width less
+                      than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):

@@ -953,18 +1012,27 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             max_size = None
         size = get_size_dict(size, max_size=max_size, default_to_square=False)
         if "shortest_edge" in size and "longest_edge" in size:
-            size = get_resize_output_image_size(
+            new_size = get_resize_output_image_size(
                 image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
             )
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
         elif "height" in size and "width" in size:
-            size = (size["height"], size["width"])
+            new_size = (size["height"], size["width"])
         else:
             raise ValueError(
                 "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                 f" {size.keys()}."
             )
         image = resize(
-            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+            image,
+            size=new_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
         )
         return image
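A hedged usage sketch of the three `size` options on this processor (the random array stands in for a real image; output shapes depend on the input):

import numpy as np
from transformers import ConditionalDetrImageProcessor

image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)  # (height, width, channels)

# Exact target size; aspect ratio is NOT preserved.
exact = ConditionalDetrImageProcessor(size={"height": 512, "width": 512})
# Aspect-ratio-preserving, bounded by shortest/longest edge (the previous default behaviour).
edges = ConditionalDetrImageProcessor(size={"shortest_edge": 800, "longest_edge": 1333})
# New in this commit: aspect-ratio-preserving, bounded by a maximum height and width.
boxed = ConditionalDetrImageProcessor(size={"max_height": 480, "max_width": 480})

print(boxed(image, return_tensors="np")["pixel_values"].shape)  # e.g. (1, 3, 360, 480)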
@@ -1108,6 +1176,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         update_bboxes: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
     ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width

@@ -1137,8 +1206,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 Whether to update the bounding boxes in the annotations to match the padded images. If the
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                 format, the bounding boxes will not be updated.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
-        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        if pad_size is not None:
+            padded_size = (pad_size["height"], pad_size["width"])
+        else:
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)

         annotation_list = annotations if annotations is not None else [None] * len(images)
         padded_images = []

@@ -1146,7 +1223,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         for image, annotation in zip(images, annotation_list):
             padded_image, padded_annotation = self._pad_image(
                 image,
-                pad_size,
+                padded_size,
                 annotation,
                 constant_values=constant_values,
                 data_format=data_format,

@@ -1160,7 +1237,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):

         if return_pixel_mask:
             masks = [
-                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
                 for image in images
             ]
             data["pixel_mask"] = masks
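The precedence this hunk introduces (explicit `pad_size` argument, then the processor's `pad_size` attribute, then the per-batch maximum) reduces to a few lines. A hypothetical standalone helper mirroring that logic:

def resolve_padded_size(image_sizes, pad_size=None, self_pad_size=None):
    # Illustrative helper, not library code: image_sizes is a list of (height, width).
    pad_size = pad_size if pad_size is not None else self_pad_size
    if pad_size is not None:
        return (pad_size["height"], pad_size["width"])
    # Fall back to the per-batch maximum, the previous behaviour.
    return (max(h for h, _ in image_sizes), max(w for _, w in image_sizes))

print(resolve_padded_size([(480, 640), (600, 400)]))                                          # (600, 640)
print(resolve_padded_size([(480, 640), (600, 400)], pad_size={"height": 800, "width": 800}))  # (800, 800)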
@@ -1195,6 +1272,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> BatchFeature:
         """

@@ -1222,7 +1300,15 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             do_resize (`bool`, *optional*, defaults to self.do_resize):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to self.size):
-                Size of the image after resizing.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge`
+                      and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting
+                      the aspect ratio and keeping the height less than or equal to `max_height` and the width less
+                      than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to self.resample):
                 Resampling filter to use when resizing the image.
             do_rescale (`bool`, *optional*, defaults to self.do_rescale):

@@ -1240,8 +1326,9 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
-                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
             format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):

@@ -1257,6 +1344,10 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
         if "pad_and_return_pixel_mask" in kwargs:
             logger.warning_once(

@@ -1286,6 +1377,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
             self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
         )
         do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
         format = self.format if format is None else format

         images = make_list_of_images(images)

@@ -1410,6 +1502,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
                 input_data_format=input_data_format,
                 update_bboxes=do_convert_annotations,
                 return_tensors=return_tensors,
+                pad_size=pad_size,
             )
         else:
             images = [
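Taken together, the new options allow fully static output shapes, e.g. for fixed-shape inference or compilation. A sketch (the sizes below are arbitrary, and resizing runs before padding in `preprocess`):

import numpy as np
from transformers import ConditionalDetrImageProcessor

processor = ConditionalDetrImageProcessor(
    size={"max_height": 640, "max_width": 640},  # resize to fit within a 640x640 box
    pad_size={"height": 640, "width": 640},      # then pad every image to exactly 640x640
)
batch = [
    np.random.randint(0, 255, (h, w, 3), dtype=np.uint8)
    for h, w in [(480, 640), (640, 480), (300, 500)]
]
out = processor(batch, return_tensors="np")
print(out["pixel_values"].shape)  # (3, 3, 640, 640): every sample shares one static shape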
src/transformers/models/deformable_detr/image_processing_deformable_detr.py

@@ -145,6 +145,42 @@ def get_resize_output_image_size(
     return get_size_with_aspect_ratio(image_size, size, max_size)


+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
+    ratio. Important: even if image_height < max_height and image_width < max_width, the image will still be resized
+    so that at least one of the edges equals max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+
+
 # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
 def get_numpy_to_framework_fn(arr) -> Callable:
     """

@@ -766,8 +802,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
             overridden by the `do_resize` parameter in the `preprocess` method.
         size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
-            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
-            the `preprocess` method.
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                  Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                  the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge` and the longest
+                  edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                  aspect ratio and keeping the height less than or equal to `max_height` and the width less than or
+                  equal to `max_width`.
         resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
             Resampling filter to use if resizing the image.
         do_rescale (`bool`, *optional*, defaults to `True`):

@@ -791,8 +835,13 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
         do_pad (`bool`, *optional*, defaults to `True`):
             Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            method. If `True` will pad the images in the batch to the largest height and width in the batch.
-            Padding will be applied to the bottom and right of the image with zeros.
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`Dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
     """

     model_input_names = ["pixel_values", "pixel_mask"]

@@ -811,6 +860,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         image_std: Union[float, List[float]] = None,
         do_convert_annotations: Optional[bool] = None,
         do_pad: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> None:
         if "pad_and_return_pixel_mask" in kwargs:

@@ -844,6 +894,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
+        self.pad_size = pad_size
         self._valid_processor_keys = [
             "images",
             "annotations",

@@ -859,6 +910,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             "image_mean",
             "image_std",
             "do_pad",
+            "pad_size",
             "format",
             "return_tensors",
             "data_format",

@@ -931,8 +983,15 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
-                `height` and `width`.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge`
+                      and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting
+                      the aspect ratio and keeping the height less than or equal to `max_height` and the width less
+                      than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):

@@ -951,18 +1010,27 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             max_size = None
         size = get_size_dict(size, max_size=max_size, default_to_square=False)
         if "shortest_edge" in size and "longest_edge" in size:
-            size = get_resize_output_image_size(
+            new_size = get_resize_output_image_size(
                 image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
             )
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
         elif "height" in size and "width" in size:
-            size = (size["height"], size["width"])
+            new_size = (size["height"], size["width"])
         else:
             raise ValueError(
                 "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                 f" {size.keys()}."
             )
         image = resize(
-            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+            image,
+            size=new_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
         )
         return image

@@ -1106,6 +1174,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         update_bboxes: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
     ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width

@@ -1135,8 +1204,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 Whether to update the bounding boxes in the annotations to match the padded images. If the
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                 format, the bounding boxes will not be updated.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
-        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        if pad_size is not None:
+            padded_size = (pad_size["height"], pad_size["width"])
+        else:
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)

         annotation_list = annotations if annotations is not None else [None] * len(images)
         padded_images = []

@@ -1144,7 +1221,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         for image, annotation in zip(images, annotation_list):
             padded_image, padded_annotation = self._pad_image(
                 image,
-                pad_size,
+                padded_size,
                 annotation,
                 constant_values=constant_values,
                 data_format=data_format,

@@ -1158,7 +1235,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):

         if return_pixel_mask:
             masks = [
-                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
                 for image in images
             ]
             data["pixel_mask"] = masks

@@ -1193,6 +1270,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> BatchFeature:
         """

@@ -1220,7 +1298,15 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             do_resize (`bool`, *optional*, defaults to self.do_resize):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to self.size):
-                Size of the image after resizing.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge`
+                      and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting
+                      the aspect ratio and keeping the height less than or equal to `max_height` and the width less
+                      than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to self.resample):
                 Resampling filter to use when resizing the image.
             do_rescale (`bool`, *optional*, defaults to self.do_rescale):

@@ -1238,8 +1324,9 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
-                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
             format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):

@@ -1255,6 +1342,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
         if "pad_and_return_pixel_mask" in kwargs:
             logger.warning_once(

@@ -1284,6 +1375,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
             self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
         )
         do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
         format = self.format if format is None else format

         images = make_list_of_images(images)

@@ -1408,6 +1500,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
                 input_data_format=input_data_format,
                 update_bboxes=do_convert_annotations,
                 return_tensors=return_tensors,
+                pad_size=pad_size,
             )
         else:
             images = [
src/transformers/models/deta/image_processing_deta.py

@@ -139,6 +139,42 @@ def get_resize_output_image_size(
     return get_size_with_aspect_ratio(image_size, size, max_size)


+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
+    ratio. Important: even if image_height < max_height and image_width < max_width, the image will still be resized
+    so that at least one of the edges equals max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+
+
 # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
 def get_numpy_to_framework_fn(arr) -> Callable:
     """

@@ -475,8 +511,16 @@ class DetaImageProcessor(BaseImageProcessor):
             Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
             overridden by the `do_resize` parameter in the `preprocess` method.
         size (`Dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
-            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
-            the `preprocess` method.
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                  Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                  the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge` and the longest
+                  edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                  aspect ratio and keeping the height less than or equal to `max_height` and the width less than or
+                  equal to `max_width`.
         resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
             Resampling filter to use if resizing the image.
         do_rescale (`bool`, *optional*, defaults to `True`):

@@ -500,8 +544,13 @@ class DetaImageProcessor(BaseImageProcessor):
             Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
         do_pad (`bool`, *optional*, defaults to `True`):
             Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            method. If `True` will pad the images in the batch to the largest height and width in the batch.
-            Padding will be applied to the bottom and right of the image with zeros.
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`Dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
     """

     model_input_names = ["pixel_values", "pixel_mask"]

@@ -519,6 +568,7 @@ class DetaImageProcessor(BaseImageProcessor):
         image_std: Union[float, List[float]] = None,
         do_convert_annotations: bool = True,
         do_pad: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> None:
         if "pad_and_return_pixel_mask" in kwargs:

@@ -542,6 +592,7 @@ class DetaImageProcessor(BaseImageProcessor):
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
+        self.pad_size = pad_size

     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA
     def prepare_annotation(

@@ -593,7 +644,15 @@ class DetaImageProcessor(BaseImageProcessor):
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                The desired output size. Can contain keys `shortest_edge` and `longest_edge` or `height` and `width`.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge`
+                      and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting
+                      the aspect ratio and keeping the height less than or equal to `max_height` and the width less
+                      than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`ChannelDimension`, *optional*):

@@ -605,18 +664,22 @@ class DetaImageProcessor(BaseImageProcessor):
         """
         size = get_size_dict(size, default_to_square=False)
         if "shortest_edge" in size and "longest_edge" in size:
-            size = get_resize_output_image_size(
+            new_size = get_resize_output_image_size(
                 image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
             )
         elif "height" in size and "width" in size:
-            size = (size["height"], size["width"])
+            new_size = (size["height"], size["width"])
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
         else:
             raise ValueError(
                 "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                 f" {size.keys()}."
             )
         image = resize(
-            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format
+            image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format
         )
         return image

@@ -760,6 +823,7 @@ class DetaImageProcessor(BaseImageProcessor):
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         update_bboxes: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
     ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width

@@ -789,8 +853,16 @@ class DetaImageProcessor(BaseImageProcessor):
                 Whether to update the bounding boxes in the annotations to match the padded images. If the
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                 format, the bounding boxes will not be updated.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
-        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        if pad_size is not None:
+            padded_size = (pad_size["height"], pad_size["width"])
+        else:
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)

         annotation_list = annotations if annotations is not None else [None] * len(images)
         padded_images = []

@@ -798,7 +870,7 @@ class DetaImageProcessor(BaseImageProcessor):
         for image, annotation in zip(images, annotation_list):
             padded_image, padded_annotation = self._pad_image(
                 image,
-                pad_size,
+                padded_size,
                 annotation,
                 constant_values=constant_values,
                 data_format=data_format,

@@ -812,7 +884,7 @@ class DetaImageProcessor(BaseImageProcessor):

         if return_pixel_mask:
             masks = [
-                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
                 for image in images
             ]
             data["pixel_mask"] = masks

@@ -846,6 +918,7 @@ class DetaImageProcessor(BaseImageProcessor):
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> BatchFeature:
         """

@@ -873,7 +946,15 @@ class DetaImageProcessor(BaseImageProcessor):
             do_resize (`bool`, *optional*, defaults to self.do_resize):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to self.size):
-                Size of the image after resizing.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge`
+                      and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting
+                      the aspect ratio and keeping the height less than or equal to `max_height` and the width less
+                      than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to self.resample):
                 Resampling filter to use when resizing the image.
             do_rescale (`bool`, *optional*, defaults to self.do_rescale):

@@ -891,8 +972,9 @@ class DetaImageProcessor(BaseImageProcessor):
                 boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
                 and in relative coordinates.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
-                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
             format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):

@@ -908,6 +990,10 @@ class DetaImageProcessor(BaseImageProcessor):
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
         if "pad_and_return_pixel_mask" in kwargs:
             logger.warning_once(

@@ -929,6 +1015,7 @@ class DetaImageProcessor(BaseImageProcessor):
             self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
         )
         do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
         format = self.format if format is None else format

         # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated.

@@ -1051,6 +1138,7 @@ class DetaImageProcessor(BaseImageProcessor):
                 input_data_format=input_data_format,
                 return_tensors=return_tensors,
                 update_bboxes=do_convert_annotations,
+                pad_size=pad_size,
             )
         else:
             images = [
@ -116,6 +116,41 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
|
||||
return (oh, ow)
|
||||
|
||||
|
||||
def get_image_size_for_max_height_width(
|
||||
input_image: np.ndarray,
|
||||
max_height: int,
|
||||
max_width: int,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
) -> Tuple[int, int]:
|
||||
"""
|
||||
Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
|
||||
Important, even if image_height < max_height and image_width < max_width, the image will be resized
|
||||
to at least one of the edges be equal to max_height or max_width.
|
||||
|
||||
For example:
|
||||
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
|
||||
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
|
||||
|
||||
Args:
|
||||
input_image (`np.ndarray`):
|
||||
The image to resize.
|
||||
max_height (`int`):
|
||||
The maximum allowed height.
|
||||
max_width (`int`):
|
||||
The maximum allowed width.
|
||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
||||
The channel dimension format of the input image. If not provided, it will be inferred from the input image.
|
||||
"""
|
||||
image_size = get_image_size(input_image, input_data_format)
|
||||
height, width = image_size
|
||||
height_scale = max_height / height
|
||||
width_scale = max_width / width
|
||||
min_scale = min(height_scale, width_scale)
|
||||
new_height = int(height * min_scale)
|
||||
new_width = int(width * min_scale)
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
def get_resize_output_image_size(
|
||||
input_image: np.ndarray,
|
||||
size: Union[int, Tuple[int, int], List[int]],
|
||||
@ -753,7 +788,15 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
overridden by the `do_resize` parameter in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
|
||||
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
|
||||
in the `preprocess` method.
|
||||
in the `preprocess` method. Available options are:
|
||||
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
|
||||
Do NOT keep the aspect ratio.
|
||||
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
|
||||
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
|
||||
less or equal to `longest_edge`.
|
||||
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
|
||||
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
|
||||
`max_width`.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
@ -777,8 +820,13 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
|
||||
do_pad (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
|
||||
method. If `True` will pad the images in the batch to the largest height and width in the batch.
|
||||
Padding will be applied to the bottom and right of the image with zeros.
|
||||
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
|
||||
If `pad_size` is provided, the image will be padded to the specified dimensions.
|
||||
Otherwise, the image will be padded to the maximum height and width of the batch.
|
||||
pad_size (`Dict[str, int]`, *optional*):
|
||||
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
|
||||
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
|
||||
height and width in the batch.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values", "pixel_mask"]
|
||||
@ -796,6 +844,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
image_std: Union[float, List[float]] = None,
|
||||
do_convert_annotations: Optional[bool] = None,
|
||||
do_pad: bool = True,
|
||||
pad_size: Optional[Dict[str, int]] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
@ -829,6 +878,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
||||
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
||||
self.do_pad = do_pad
|
||||
self.pad_size = pad_size
|
||||
self._valid_processor_keys = [
|
||||
"images",
|
||||
"annotations",
|
||||
@ -844,6 +894,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
"image_mean",
|
||||
"image_std",
|
||||
"do_pad",
|
||||
"pad_size",
|
||||
"format",
|
||||
"return_tensors",
|
||||
"data_format",
|
||||
@ -913,8 +964,15 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
image (`np.ndarray`):
|
||||
Image to resize.
|
||||
size (`Dict[str, int]`):
|
||||
Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
|
||||
`height` and `width`.
|
||||
Size of the image's `(height, width)` dimensions after resizing. Available options are:
|
||||
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
|
||||
Do NOT keep the aspect ratio.
|
||||
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
|
||||
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
|
||||
less or equal to `longest_edge`.
|
||||
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
|
||||
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
|
||||
`max_width`.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image.
|
||||
data_format (`str` or `ChannelDimension`, *optional*):
|
||||
@ -933,18 +991,27 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
max_size = None
|
||||
size = get_size_dict(size, max_size=max_size, default_to_square=False)
|
||||
if "shortest_edge" in size and "longest_edge" in size:
|
||||
size = get_resize_output_image_size(
|
||||
new_size = get_resize_output_image_size(
|
||||
image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
|
||||
)
|
||||
elif "max_height" in size and "max_width" in size:
|
||||
new_size = get_image_size_for_max_height_width(
|
||||
image, size["max_height"], size["max_width"], input_data_format=input_data_format
|
||||
)
|
||||
elif "height" in size and "width" in size:
|
||||
size = (size["height"], size["width"])
|
||||
new_size = (size["height"], size["width"])
|
||||
else:
|
||||
raise ValueError(
|
||||
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
|
||||
f" {size.keys()}."
|
||||
)
|
||||
image = resize(
|
||||
image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
|
||||
image,
|
||||
size=new_size,
|
||||
resample=resample,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
**kwargs,
|
||||
)
|
||||
return image
|
||||
|
||||
@ -1083,6 +1150,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
data_format: Optional[ChannelDimension] = None,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||
update_bboxes: bool = True,
|
||||
pad_size: Optional[Dict[str, int]] = None,
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
|
||||
@ -1112,8 +1180,16 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
Whether to update the bounding boxes in the annotations to match the padded images. If the
|
||||
bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
|
||||
format, the bounding boxes will not be updated.
|
||||
pad_size (`Dict[str, int]`, *optional*):
|
||||
The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
|
||||
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
|
||||
height and width in the batch.
|
||||
"""
|
||||
pad_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||
pad_size = pad_size if pad_size is not None else self.pad_size
|
||||
if pad_size is not None:
|
||||
padded_size = (pad_size["height"], pad_size["width"])
|
||||
else:
|
||||
padded_size = get_max_height_width(images, input_data_format=input_data_format)
|
||||
|
||||
annotation_list = annotations if annotations is not None else [None] * len(images)
|
||||
padded_images = []
|
||||
@ -1121,7 +1197,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
for image, annotation in zip(images, annotation_list):
|
||||
padded_image, padded_annotation = self._pad_image(
|
||||
image,
|
||||
pad_size,
|
||||
padded_size,
|
||||
annotation,
|
||||
constant_values=constant_values,
|
||||
data_format=data_format,
|
||||
@ -1135,7 +1211,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
|
||||
if return_pixel_mask:
|
||||
masks = [
|
||||
make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
|
||||
make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
|
||||
for image in images
|
||||
]
|
||||
data["pixel_mask"] = masks
|
||||

@ -1169,6 +1245,7 @@ class DetrImageProcessor(BaseImageProcessor):
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        pad_size: Optional[Dict[str, int]] = None,
        **kwargs,
    ) -> BatchFeature:
        """

@ -1196,7 +1273,15 @@ class DetrImageProcessor(BaseImageProcessor):
            do_resize (`bool`, *optional*, defaults to self.do_resize):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to self.size):
                Size of the image after resizing.
                Size of the image's `(height, width)` dimensions after resizing. Available options are:
                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                      Do NOT keep the aspect ratio.
                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                      the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                      less or equal to `longest_edge`.
                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                      aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                      `max_width`.
            resample (`PILImageResampling`, *optional*, defaults to self.resample):
                Resampling filter to use when resizing the image.
            do_rescale (`bool`, *optional*, defaults to self.do_rescale):

@ -1214,8 +1299,9 @@ class DetrImageProcessor(BaseImageProcessor):
            image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                Standard deviation to use when normalizing the image.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):

@ -1231,6 +1317,10 @@ class DetrImageProcessor(BaseImageProcessor):
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            pad_size (`Dict[str, int]`, *optional*):
                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
                height and width in the batch.
"""
|
||||
if "pad_and_return_pixel_mask" in kwargs:
|
||||
logger.warning_once(
|
||||
@ -1260,6 +1350,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
pad_size = self.pad_size if pad_size is None else pad_size
|
||||
format = self.format if format is None else format
|
||||
|
||||
images = make_list_of_images(images)
|
||||
@ -1384,6 +1475,7 @@ class DetrImageProcessor(BaseImageProcessor):
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=do_convert_annotations,
|
||||
return_tensors=return_tensors,
|
||||
pad_size=pad_size,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
|
@ -152,6 +152,42 @@ def get_resize_output_image_size(
    return get_size_with_aspect_ratio(image_size, size, max_size)


# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
    input_image: np.ndarray,
    max_height: int,
    max_width: int,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
||||
    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
    ratio. Important: even if image_height < max_height and image_width < max_width, the image will be resized so that
    at least one of the edges equals max_height or max_width.

    For example:
        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        max_height (`int`):
            The maximum allowed height.
        max_width (`int`):
            The maximum allowed width.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
    """
    image_size = get_image_size(input_image, input_data_format)
    height, width = image_size
    height_scale = max_height / height
    width_scale = max_width / width
    min_scale = min(height_scale, width_scale)
    new_height = int(height * min_scale)
    new_width = int(width * min_scale)
    return new_height, new_width
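
As a quick check on the scaling rule above, here is a minimal, self-contained sketch of the same computation (the helper name and plain-tuple signature are illustrative, not the library API), reproducing the two docstring examples:

def _max_hw_size(height: int, width: int, max_height: int, max_width: int):
    # Use the smaller of the two scales so both constraints hold, then
    # truncate with int(), matching the function above.
    min_scale = min(max_height / height, max_width / width)
    return int(height * min_scale), int(width * min_scale)

assert _max_hw_size(100, 200, 50, 50) == (25, 50)
assert _max_hw_size(100, 200, 200, 500) == (200, 400)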


# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
    """

@ -773,8 +809,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
            overridden by the `do_resize` parameter in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
            the `preprocess` method.
            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
            in the `preprocess` method. Available options are:
                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                  Do NOT keep the aspect ratio.
                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                  the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                  less or equal to `longest_edge`.
                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                  aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                  `max_width`.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):

@ -798,8 +842,14 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
            overridden by the `do_pad` parameter in the `preprocess` method.
            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
            If `pad_size` is provided, the image will be padded to the specified dimensions.
            Otherwise, the image will be padded to the maximum height and width of the batch.
        pad_size (`Dict[str, int]`, *optional*):
            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
            height and width in the batch.
    """

    model_input_names = ["pixel_values", "pixel_mask"]

@ -818,6 +868,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: bool = True,
        pad_size: Optional[Dict[str, int]] = None,
        **kwargs,
    ) -> None:
        if "pad_and_return_pixel_mask" in kwargs:

@ -851,6 +902,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
        self.pad_size = pad_size
        self._valid_processor_keys = [
            "images",
            "annotations",

@ -866,6 +918,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
            "image_mean",
            "image_std",
            "do_pad",
            "pad_size",
            "format",
            "return_tensors",
            "data_format",

@ -938,8 +991,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
                `height` and `width`.
                Size of the image's `(height, width)` dimensions after resizing. Available options are:
                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                      Do NOT keep the aspect ratio.
                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                      the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                      less or equal to `longest_edge`.
                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                      aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                      `max_width`.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use if resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):

@ -958,18 +1018,27 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
            max_size = None
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        if "shortest_edge" in size and "longest_edge" in size:
            size = get_resize_output_image_size(
            new_size = get_resize_output_image_size(
                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
            )
        elif "max_height" in size and "max_width" in size:
            new_size = get_image_size_for_max_height_width(
                image, size["max_height"], size["max_width"], input_data_format=input_data_format
            )
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
            new_size = (size["height"], size["width"])
        else:
            raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
|
||||
f" {size.keys()}."
|
||||
)
|
||||
        image = resize(
            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
            image,
            size=new_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
        return image
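
The rewritten `resize` above dispatches on which keys the `size` dict carries. The following standalone sketch mirrors that branch logic (the dispatcher is an illustrative stand-in, not the library function, and it approximates the helpers' rounding):

def _pick_output_size(height, width, size):
    if "shortest_edge" in size and "longest_edge" in size:
        # Aspect ratio kept: bound the shorter and longer edges.
        scale = min(size["shortest_edge"] / min(height, width), size["longest_edge"] / max(height, width))
        return int(height * scale), int(width * scale)
    if "max_height" in size and "max_width" in size:
        # Aspect ratio kept: bound height and width independently.
        scale = min(size["max_height"] / height, size["max_width"] / width)
        return int(height * scale), int(width * scale)
    if "height" in size and "width" in size:
        return size["height"], size["width"]  # exact size, aspect ratio NOT kept
    raise ValueError(f"Unsupported size keys: {size.keys()}")

assert _pick_output_size(480, 640, {"max_height": 100, "max_width": 100}) == (75, 100)
assert _pick_output_size(480, 640, {"height": 100, "width": 100}) == (100, 100)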

@ -1113,6 +1182,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
        pad_size: Optional[Dict[str, int]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width

@ -1142,8 +1212,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
                Whether to update the bounding boxes in the annotations to match the padded images. If the
                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                format, the bounding boxes will not be updated.
            pad_size (`Dict[str, int]`, *optional*):
                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
                height and width in the batch.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)
        pad_size = pad_size if pad_size is not None else self.pad_size
        if pad_size is not None:
            padded_size = (pad_size["height"], pad_size["width"])
        else:
            padded_size = get_max_height_width(images, input_data_format=input_data_format)

        annotation_list = annotations if annotations is not None else [None] * len(images)
        padded_images = []

@ -1151,7 +1229,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
        for image, annotation in zip(images, annotation_list):
            padded_image, padded_annotation = self._pad_image(
                image,
                pad_size,
                padded_size,
                annotation,
                constant_values=constant_values,
                data_format=data_format,

@ -1165,7 +1243,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):

        if return_pixel_mask:
            masks = [
                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
                for image in images
            ]
            data["pixel_mask"] = masks

@ -1200,6 +1278,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        pad_size: Optional[Dict[str, int]] = None,
        **kwargs,
    ) -> BatchFeature:
        """

@ -1227,7 +1306,15 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
            do_resize (`bool`, *optional*, defaults to self.do_resize):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to self.size):
                Size of the image after resizing.
                Size of the image's `(height, width)` dimensions after resizing. Available options are:
                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                      Do NOT keep the aspect ratio.
                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                      the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                      less or equal to `longest_edge`.
                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                      aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                      `max_width`.
            resample (`PILImageResampling`, *optional*, defaults to self.resample):
                Resampling filter to use when resizing the image.
            do_rescale (`bool`, *optional*, defaults to self.do_rescale):

@ -1245,8 +1332,9 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
            image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                Standard deviation to use when normalizing the image.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):

@ -1262,6 +1350,10 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            pad_size (`Dict[str, int]`, *optional*):
                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
                height and width in the batch.
        """
if "pad_and_return_pixel_mask" in kwargs:
|
||||
logger.warning_once(
|
||||
@ -1291,6 +1383,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
pad_size = self.pad_size if pad_size is None else pad_size
|
||||
format = self.format if format is None else format
|
||||
|
||||
images = make_list_of_images(images)
|
||||
@ -1415,6 +1508,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor):
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=do_convert_annotations,
|
||||
return_tensors=return_tensors,
|
||||
pad_size=pad_size,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
|
@ -133,6 +133,42 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
    return (height, width)


# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
def get_image_size_for_max_height_width(
    input_image: np.ndarray,
    max_height: int,
    max_width: int,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
    ratio. Important: even if image_height < max_height and image_width < max_width, the image will be resized so that
    at least one of the edges equals max_height or max_width.

    For example:
        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        max_height (`int`):
            The maximum allowed height.
        max_width (`int`):
            The maximum allowed width.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
    """
    image_size = get_image_size(input_image, input_data_format)
    height, width = image_size
    height_scale = max_height / height
    width_scale = max_width / width
    min_scale = min(height_scale, width_scale)
    new_height = int(height * min_scale)
    new_width = int(width * min_scale)
    return new_height, new_width


# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
    input_image: np.ndarray,

@ -678,8 +714,16 @@ class YolosImageProcessor(BaseImageProcessor):
            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
            overridden by the `do_resize` parameter in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
            the `preprocess` method.
            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
            in the `preprocess` method. Available options are:
                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                  Do NOT keep the aspect ratio.
                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                  the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                  less or equal to `longest_edge`.
                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                  aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                  `max_width`.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):

@ -699,8 +743,13 @@ class YolosImageProcessor(BaseImageProcessor):
            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
            method. If `True` will pad the images in the batch to the largest height and width in the batch.
            Padding will be applied to the bottom and right of the image with zeros.
            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
            If `pad_size` is provided, the image will be padded to the specified dimensions.
            Otherwise, the image will be padded to the maximum height and width of the batch.
        pad_size (`Dict[str, int]`, *optional*):
            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
            height and width in the batch.
    """

    model_input_names = ["pixel_values", "pixel_mask"]

@ -718,6 +767,7 @@ class YolosImageProcessor(BaseImageProcessor):
        image_std: Union[float, List[float]] = None,
        do_convert_annotations: Optional[bool] = None,
        do_pad: bool = True,
        pad_size: Optional[Dict[str, int]] = None,
        **kwargs,
    ) -> None:
        if "pad_and_return_pixel_mask" in kwargs:

@ -751,6 +801,7 @@ class YolosImageProcessor(BaseImageProcessor):
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.do_pad = do_pad
        self.pad_size = pad_size
        self._valid_processor_keys = [
            "images",
            "annotations",

@ -766,6 +817,7 @@ class YolosImageProcessor(BaseImageProcessor):
            "image_std",
            "do_convert_annotations",
            "do_pad",
            "pad_size",
            "format",
            "return_tensors",
            "data_format",

@ -838,8 +890,15 @@ class YolosImageProcessor(BaseImageProcessor):
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
                `height` and `width`.
                Size of the image's `(height, width)` dimensions after resizing. Available options are:
                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                      Do NOT keep the aspect ratio.
                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                      the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                      less or equal to `longest_edge`.
                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                      aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                      `max_width`.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use if resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):

@ -858,18 +917,27 @@ class YolosImageProcessor(BaseImageProcessor):
            max_size = None
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        if "shortest_edge" in size and "longest_edge" in size:
            size = get_resize_output_image_size(
            new_size = get_resize_output_image_size(
                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
            )
        elif "max_height" in size and "max_width" in size:
            new_size = get_image_size_for_max_height_width(
                image, size["max_height"], size["max_width"], input_data_format=input_data_format
            )
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
            new_size = (size["height"], size["width"])
        else:
            raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
|
||||
f" {size.keys()}."
|
||||
)
|
||||
        image = resize(
            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
            image,
            size=new_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
        return image

@ -1012,6 +1080,7 @@ class YolosImageProcessor(BaseImageProcessor):
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        update_bboxes: bool = True,
        pad_size: Optional[Dict[str, int]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width

@ -1042,8 +1111,16 @@ class YolosImageProcessor(BaseImageProcessor):
                Whether to update the bounding boxes in the annotations to match the padded images. If the
                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
                format, the bounding boxes will not be updated.
            pad_size (`Dict[str, int]`, *optional*):
                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
                height and width in the batch.
        """
        pad_size = get_max_height_width(images, input_data_format=input_data_format)
        pad_size = pad_size if pad_size is not None else self.pad_size
        if pad_size is not None:
            padded_size = (pad_size["height"], pad_size["width"])
        else:
            padded_size = get_max_height_width(images, input_data_format=input_data_format)

        annotation_list = annotations if annotations is not None else [None] * len(images)
        padded_images = []

@ -1051,7 +1128,7 @@ class YolosImageProcessor(BaseImageProcessor):
        for image, annotation in zip(images, annotation_list):
            padded_image, padded_annotation = self._pad_image(
                image,
                pad_size,
                padded_size,
                annotation,
                constant_values=constant_values,
                data_format=data_format,

@ -1065,7 +1142,7 @@ class YolosImageProcessor(BaseImageProcessor):

        if return_pixel_mask:
            masks = [
                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
                for image in images
            ]
            data["pixel_mask"] = masks

@ -1099,6 +1176,7 @@ class YolosImageProcessor(BaseImageProcessor):
        return_tensors: Optional[Union[TensorType, str]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        pad_size: Optional[Dict[str, int]] = None,
        **kwargs,
    ) -> BatchFeature:
        """

@ -1126,7 +1204,15 @@ class YolosImageProcessor(BaseImageProcessor):
            do_resize (`bool`, *optional*, defaults to self.do_resize):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to self.size):
                Size of the image after resizing.
                Size of the image's `(height, width)` dimensions after resizing. Available options are:
                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                      Do NOT keep the aspect ratio.
                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                      the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                      less or equal to `longest_edge`.
                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                      aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                      `max_width`.
            resample (`PILImageResampling`, *optional*, defaults to self.resample):
                Resampling filter to use when resizing the image.
            do_rescale (`bool`, *optional*, defaults to self.do_rescale):

@ -1144,8 +1230,9 @@ class YolosImageProcessor(BaseImageProcessor):
                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
                and in relative coordinates.
            do_pad (`bool`, *optional*, defaults to self.do_pad):
                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                Format of the annotations.
            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):

@ -1158,6 +1245,10 @@ class YolosImageProcessor(BaseImageProcessor):
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            pad_size (`Dict[str, int]`, *optional*):
                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
                height and width in the batch.
        """
if "pad_and_return_pixel_mask" in kwargs:
|
||||
logger.warning_once(
|
||||
@ -1187,6 +1278,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
|
||||
)
|
||||
do_pad = self.do_pad if do_pad is None else do_pad
|
||||
pad_size = self.pad_size if pad_size is None else pad_size
|
||||
format = self.format if format is None else format
|
||||
validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
|
||||
|
||||
@ -1310,6 +1402,7 @@ class YolosImageProcessor(BaseImageProcessor):
|
||||
input_data_format=input_data_format,
|
||||
update_bboxes=do_convert_annotations,
|
||||
return_tensors=return_tensors,
|
||||
pad_size=pad_size,
|
||||
)
|
||||
else:
|
||||
images = [
|
||||
|
@ -490,3 +490,50 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr
    def test_max_width_max_height_resizing_and_pad_strategy(self):
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = ConditionalDetrImageProcessor(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        image_processor = ConditionalDetrImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = ConditionalDetrImageProcessor(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100, padded to 301x101
        image_processor = ConditionalDetrImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = ConditionalDetrImageProcessor(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

@ -492,3 +492,50 @@ class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessi
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr
    def test_max_width_max_height_resizing_and_pad_strategy(self):
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = DeformableDetrImageProcessor(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        image_processor = DeformableDetrImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = DeformableDetrImageProcessor(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100, padded to 301x101
        image_processor = DeformableDetrImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = DeformableDetrImageProcessor(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

@ -486,3 +486,50 @@ class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta
    def test_max_width_max_height_resizing_and_pad_strategy(self):
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = DetaImageProcessor(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        image_processor = DetaImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = DetaImageProcessor(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100, padded to 301x101
        image_processor = DetaImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = DetaImageProcessor(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

@ -547,3 +547,49 @@ class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixi
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))

    def test_max_width_max_height_resizing_and_pad_strategy(self):
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = DetrImageProcessor(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        image_processor = DetrImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = DetrImageProcessor(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100, padded to 301x101
        image_processor = DetrImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = DetrImageProcessor(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
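
Taken together, the new options give preprocessing a fully static output shape: `size={"max_height": ..., "max_width": ...}` bounds the resize while keeping the aspect ratio, and `pad_size` pads every image to the same canvas. A usage sketch of the `DetrImageProcessor` API exercised by the test above (the 640x480 input is an arbitrary example):

import torch
from transformers import DetrImageProcessor

processor = DetrImageProcessor(
    size={"max_height": 480, "max_width": 480},  # resize, aspect ratio kept
    do_pad=True,
    pad_size={"height": 480, "width": 480},  # then pad to a fixed canvas
)
batch = processor(images=[torch.ones([640, 480, 3], dtype=torch.uint8)], return_tensors="pt")
# Every batch now has the same static shape, regardless of input sizes.
assert batch["pixel_values"].shape == torch.Size([1, 3, 480, 480])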

@ -528,3 +528,50 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino
    def test_max_width_max_height_resizing_and_pad_strategy(self):
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = GroundingDinoImageProcessor(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        image_processor = GroundingDinoImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = GroundingDinoImageProcessor(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100, padded to 301x101
        image_processor = GroundingDinoImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = GroundingDinoImageProcessor(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))

@ -546,3 +546,50 @@ class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMix
        ).T
        self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
        self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))

    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
    def test_max_width_max_height_resizing_and_pad_strategy(self):
        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)

        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
        image_processor = YolosImageProcessor(
            size={"max_height": 100, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))

        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
        image_processor = YolosImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=False,
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))

        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
        image_processor = YolosImageProcessor(
            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))

        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100, padded to 301x101
        image_processor = YolosImageProcessor(
            size={"max_height": 300, "max_width": 100},
            do_pad=True,
            pad_size={"height": 301, "width": 101},
        )
        inputs = image_processor(images=[image_1], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))

        ### Check for batch
        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)

        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
        image_processor = YolosImageProcessor(
            size={"max_height": 150, "max_width": 100},
            do_pad=True,
            pad_size={"height": 150, "width": 100},
        )
        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))