diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index ef9f5284a3a..37948ad9e8e 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -969,7 +969,30 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): """ image_processor_dict = image_processor_dict.copy() if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") + max_size = kwargs.pop("max_size") + # Check for size in both image_processor_dict and kwargs + size = kwargs.get("size", image_processor_dict.get("size")) + + if size is not None: + # If size is an integer, convert to shortest_edge dict + if isinstance(size, int): + size = {"shortest_edge": size} + # If size is a dict but missing longest_edge, add it + elif isinstance(size, dict) and "longest_edge" not in size: + size = dict(size) # Make a copy + + if isinstance(size, dict) and "longest_edge" not in size: + size["longest_edge"] = max_size + + # Update both locations if size was in kwargs + if "size" in kwargs: + kwargs["size"] = size + else: + image_processor_dict["size"] = size + else: + # If no size provided, create default size with max_size + image_processor_dict["size"] = {"shortest_edge": 800, "longest_edge": max_size} + if "pad_and_return_pixel_mask" in kwargs: image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) @@ -1433,8 +1456,11 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): else: # If size is already provided, we need to handle max_size appropriately if isinstance(size, dict) and "longest_edge" not in size: - size = get_size_dict(size, max_size=max_size, default_to_square=False) + size = dict(size) # Make a copy to avoid modifying the original + size["longest_edge"] = max_size # If size already has longest_edge, the max_size is ignored (deprecated behavior) + else: + max_size = None if size is None else 1333 do_resize = self.do_resize if do_resize is None else do_resize size = self.size if size is None else size @@ -1650,11 +1676,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): # Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection with DeformableDetr->ConditionalDetr def post_process_object_detection( - self, - outputs, - threshold: float = 0.5, - target_sizes: Union[TensorType, list[tuple]] = None, - top_k: int = 100, + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 ): """ Converts the raw output of [`ConditionalDetrForObjectDetection`] into final bounding boxes in (top_left_x, diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 9458062a45f..b7c061503eb 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -946,7 +946,30 @@ class DeformableDetrImageProcessor(BaseImageProcessor): """ image_processor_dict = image_processor_dict.copy() if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") + max_size = kwargs.pop("max_size") + # Check for size in both 
image_processor_dict and kwargs + size = kwargs.get("size", image_processor_dict.get("size")) + + if size is not None: + # If size is an integer, convert to shortest_edge dict + if isinstance(size, int): + size = {"shortest_edge": size} + # If size is a dict but missing longest_edge, add it + elif isinstance(size, dict) and "longest_edge" not in size: + size = dict(size) # Make a copy + + if isinstance(size, dict) and "longest_edge" not in size: + size["longest_edge"] = max_size + + # Update both locations if size was in kwargs + if "size" in kwargs: + kwargs["size"] = size + else: + image_processor_dict["size"] = size + else: + # If no size provided, create default size with max_size + image_processor_dict["size"] = {"shortest_edge": 800, "longest_edge": max_size} + if "pad_and_return_pixel_mask" in kwargs: image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) @@ -1034,11 +1057,17 @@ class DeformableDetrImageProcessor(BaseImageProcessor): size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( - image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + image, + size["shortest_edge"], + size["longest_edge"], + input_data_format=input_data_format, ) elif "max_height" in size and "max_width" in size: new_size = get_image_size_for_max_height_width( - image, size["max_height"], size["max_width"], input_data_format=input_data_format + image, + size["max_height"], + size["max_width"], + input_data_format=input_data_format, ) elif "height" in size and "width" in size: new_size = (size["height"], size["width"]) @@ -1098,7 +1127,12 @@ class DeformableDetrImageProcessor(BaseImageProcessor): - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
""" - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + return rescale( + image, + rescale_factor, + data_format=data_format, + input_data_format=input_data_format, + ) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: @@ -1182,7 +1216,11 @@ class DeformableDetrImageProcessor(BaseImageProcessor): ) if annotation is not None: annotation = self._update_annotation_for_padded_image( - annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes + annotation, + (input_height, input_width), + (output_height, output_width), + padding, + update_bboxes, ) return padded_image, annotation @@ -1258,7 +1296,11 @@ class DeformableDetrImageProcessor(BaseImageProcessor): if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) + make_pixel_mask( + image=image, + output_size=padded_size, + input_data_format=input_data_format, + ) for image in images ] data["pixel_mask"] = masks @@ -1391,6 +1433,8 @@ class DeformableDetrImageProcessor(BaseImageProcessor): size = dict(size) # Make a copy to avoid modifying the original size["longest_edge"] = max_size # If size already has longest_edge, the max_size is ignored (deprecated behavior) + else: + max_size = None if size is None else 1333 do_resize = self.do_resize if do_resize is None else do_resize size = self.size if size is None else size @@ -1415,7 +1459,10 @@ class DeformableDetrImageProcessor(BaseImageProcessor): "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." ) - validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + validate_kwargs( + captured_kwargs=kwargs.keys(), + valid_processor_keys=self._valid_processor_keys, + ) # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. 
validate_preprocess_arguments( diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 19f4d7439eb..048bd816743 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -950,7 +950,30 @@ class DetrImageProcessor(BaseImageProcessor): """ image_processor_dict = image_processor_dict.copy() if "max_size" in kwargs: - image_processor_dict["max_size"] = kwargs.pop("max_size") + max_size = kwargs.pop("max_size") + # Check for size in both image_processor_dict and kwargs + size = kwargs.get("size", image_processor_dict.get("size")) + + if size is not None: + # If size is an integer, convert to shortest_edge dict + if isinstance(size, int): + size = {"shortest_edge": size} + # If size is a dict but missing longest_edge, add it + elif isinstance(size, dict) and "longest_edge" not in size: + size = dict(size) # Make a copy + + if isinstance(size, dict) and "longest_edge" not in size: + size["longest_edge"] = max_size + + # Update both locations if size was in kwargs + if "size" in kwargs: + kwargs["size"] = size + else: + image_processor_dict["size"] = size + else: + # If no size provided, create default size with max_size + image_processor_dict["size"] = {"shortest_edge": 800, "longest_edge": max_size} + if "pad_and_return_pixel_mask" in kwargs: image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py.backup b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py.backup new file mode 100644 index 00000000000..039828f16ff --- /dev/null +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py.backup @@ -0,0 +1,1667 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_annotations, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + +if TYPE_CHECKING: + from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = dict[str, Union[int, str, list[dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + raw_size = None + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) + + if (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + elif width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) + else: + oh = size + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, tuple[int, int], list[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. 
If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `tuple[int, int]` or `list[int]`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. 
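+
+    Example (illustrative doctest added for clarity): only an axis of size 1 is removed; any other axis is
+    left untouched.
+
+    >>> import numpy as np
+    >>> safe_squeeze(np.zeros((1, 3, 4)), axis=0).shape
+    (3, 4)
+    >>> safe_squeeze(np.zeros((2, 3, 4)), axis=0).shape  # axis 0 has size 2, returned unchanged
+    (2, 3, 4)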
+ """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> list[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> list[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`list[list[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. 
+ """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDino +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GroundingDino. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array + keypoints = np.asarray(keypoints, dtype=np.float32) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width) + new_target["masks"] = masks[keep] + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes +def masks_to_boxes(masks: np.ndarray) -> np.ndarray: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. 
+ + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDino +def prepare_coco_panoptic_annotation( + image: np.ndarray, + target: dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> dict: + """ + Prepare a coco panoptic annotation for GroundingDino. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64) + new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64) + new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64) + + if "segments_info" in target: + masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([segment_info["id"] for segment_info in target["segments_info"]]) + masks = masks == ids[:, None, None] + masks = masks.astype(np.uint8) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = np.array( + [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["iscrowd"] = np.asarray( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["area"] = np.asarray( + [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32 + ) + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image +def get_segmentation_image( + masks: np.ndarray, input_size: tuple, target_size: tuple, stuff_equiv_classes, deduplicate=False +): + h, w = input_size + final_h, final_w = target_size + + m_id = scipy.special.softmax(masks.transpose(0, 1), -1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = np.zeros((h, w), dtype=np.int64) + else: + m_id = m_id.argmax(-1).reshape(h, w) + + if deduplicate: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + for eq_id in equiv: + m_id[m_id == eq_id] = equiv[0] + + seg_img = id_to_rgb(m_id) + seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST) + return seg_img + + +# Copied from transformers.models.detr.image_processing_detr.get_mask_area +def get_mask_area(seg_img: 
np.ndarray, target_size: tuple[int, int], n_classes: int) -> np.ndarray:
+    final_h, final_w = target_size
+    np_seg_img = seg_img.astype(np.uint8)
+    np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
+    m_id = rgb_to_id(np_seg_img)
+    area = [(m_id == i).sum() for i in range(n_classes)]
+    return area
+
+
+# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities
+def score_labels_from_class_probabilities(logits: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    probs = scipy.special.softmax(logits, axis=-1)
+    labels = probs.argmax(-1, keepdims=True)
+    scores = np.take_along_axis(probs, labels, axis=-1)
+    scores, labels = scores.squeeze(-1), labels.squeeze(-1)
+    return scores, labels
+
+
+# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample
+def post_process_panoptic_sample(
+    out_logits: np.ndarray,
+    masks: np.ndarray,
+    boxes: np.ndarray,
+    processed_size: tuple[int, int],
+    target_size: tuple[int, int],
+    is_thing_map: dict,
+    threshold=0.85,
+) -> dict:
+    """
+    Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
+
+    Args:
+        out_logits (`torch.Tensor`):
+            The logits for this sample.
+        masks (`torch.Tensor`):
+            The predicted segmentation masks for this sample.
+        boxes (`torch.Tensor`):
+            The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
+            width, height)` and values between `[0, 1]`, relative to the size of the image (disregarding padding).
+        processed_size (`tuple[int, int]`):
+            The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
+            after data augmentation but before batching.
+        target_size (`tuple[int, int]`):
+            The target size of the image, `(height, width)` corresponding to the requested final size of the
+            prediction.
+        is_thing_map (`Dict`):
+            A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
+        threshold (`float`, *optional*, defaults to 0.85):
+            The threshold used to binarize the segmentation masks.
+    """
+    # We filter out empty queries and detections below the threshold
+    scores, labels = score_labels_from_class_probabilities(out_logits)
+    keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
+
+    cur_scores = scores[keep]
+    cur_classes = labels[keep]
+    cur_boxes = center_to_corners_format(boxes[keep])
+
+    if len(cur_boxes) != len(cur_classes):
+        raise ValueError("Not as many boxes as there are classes")
+
+    cur_masks = masks[keep]
+    cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
+    cur_masks = safe_squeeze(cur_masks, 1)
+    b, h, w = cur_masks.shape
+
+    # It may be that we have several predicted masks for the same stuff class.
+    # In the following, we track the list of mask ids for each stuff class (they are merged later on)
+    cur_masks = cur_masks.reshape(b, -1)
+    stuff_equiv_classes = defaultdict(list)
+    for k, label in enumerate(cur_classes):
+        if not is_thing_map[label]:
+            stuff_equiv_classes[label].append(k)
+
+    seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
+    area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
+
+    # We filter out any mask that is too small
+    if cur_classes.size > 0:
+        # We now filter empty masks as long as we find some
+        filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+        while filtered_small.any():
+            cur_masks = cur_masks[~filtered_small]
+            cur_scores = cur_scores[~filtered_small]
+            cur_classes = cur_classes[~filtered_small]
+            seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
+            area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
+            filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+    else:
+        cur_classes = np.ones((1, 1), dtype=np.int64)
+
+    segments_info = [
+        {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
+        for i, (cat, a) in enumerate(zip(cur_classes, area))
+    ]
+    del cur_classes
+
+    with io.BytesIO() as out:
+        PIL.Image.fromarray(seg_img).save(out, format="PNG")
+        predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+
+    return predictions
+
+
+# Copied from transformers.models.detr.image_processing_detr.resize_annotation
+def resize_annotation(
+    annotation: dict[str, Any],
+    orig_size: tuple[int, int],
+    target_size: tuple[int, int],
+    threshold: float = 0.5,
+    resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+    """
+    Resizes an annotation to a target size.
+
+    Args:
+        annotation (`dict[str, Any]`):
+            The annotation dictionary.
+        orig_size (`tuple[int, int]`):
+            The original size of the input image.
+        target_size (`tuple[int, int]`):
+            The target size of the image, as returned by the preprocessing `resize` step.
+        threshold (`float`, *optional*, defaults to 0.5):
+            The threshold used to binarize the segmentation masks.
+        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+            The resampling filter to use when resizing the masks.
+    """
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
+    ratio_height, ratio_width = ratios
+
+    new_annotation = {}
+    new_annotation["size"] = target_size
+
+    for key, value in annotation.items():
+        if key == "boxes":
+            boxes = value
+            scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+            new_annotation["boxes"] = scaled_boxes
+        elif key == "area":
+            area = value
+            scaled_area = area * (ratio_width * ratio_height)
+            new_annotation["area"] = scaled_area
+        elif key == "masks":
+            masks = value[:, None]
+            masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
+            masks = masks.astype(np.float32)
+            masks = masks[:, 0] > threshold
+            new_annotation["masks"] = masks
+        elif key == "size":
+            new_annotation["size"] = target_size
+        else:
+            new_annotation[key] = value
+
+    return new_annotation
+
+
+# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle
+def binary_mask_to_rle(mask):
+    """
+    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
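+    For example (illustrative), the mask `[[0, 1, 1], [0, 1, 0]]` flattens to `0 1 1 0 1 0` and encodes to
+    `[2, 2, 5, 1]`: a run of two 1s starting at (1-indexed) pixel 2, then a run of one 1 starting at pixel 5.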
+ + Args: + mask (`torch.Tensor` or `numpy.array`): + A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target + segment_id or class_id. + Returns: + `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE + format. + """ + if is_torch_tensor(mask): + mask = mask.numpy() + + pixels = mask.flatten() + pixels = np.concatenate([[0], pixels, [0]]) + runs = np.where(pixels[1:] != pixels[:-1])[0] + 1 + runs[1::2] -= runs[::2] + return list(runs) + + +# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle +def convert_segmentation_to_rle(segmentation): + """ + Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + segmentation (`torch.Tensor` or `numpy.array`): + A segmentation map of shape `(height, width)` where each value denotes a segment or class id. + Returns: + `list[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. + """ + segment_ids = torch.unique(segmentation) + + run_length_encodings = [] + for idx in segment_ids: + mask = torch.where(segmentation == idx, 1, 0) + rle = binary_mask_to_rle(mask) + run_length_encodings.append(rle) + + return run_length_encodings + + +# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. 
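+
+    Example (illustrative): with `num_labels=5`, label 5 is the "no object" class and the 0.2 score falls below
+    the threshold, so only the first query survives.
+
+    >>> masks = torch.rand(3, 4, 4)
+    >>> scores = torch.tensor([0.9, 0.2, 0.7])
+    >>> labels = torch.tensor([0, 1, 5])
+    >>> remove_low_and_no_objects(masks, scores, labels, object_mask_threshold=0.5, num_labels=5)[1]
+    tensor([0.9000])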
+ """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +# Copied from transformers.models.detr.image_processing_detr.check_segment_validity +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_area = (mask_probs[k] >= mask_threshold).sum() + mask_exists = mask_k_area > 0 and original_area > 0 + + # Eliminate disconnected tiny segments + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, mask_k + + +# Copied from transformers.models.detr.image_processing_detr.compute_segments +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[set[int]] = None, + target_size: Optional[tuple[int, int]] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device) + segments: list[dict] = [] + + if target_size is not None: + mask_probs = nn.functional.interpolate( + mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + )[0] + + current_segment_id = 0 + + # Weigh each mask by its prediction score + mask_probs *= pred_scores.view(-1, 1, 1) + mask_labels = mask_probs.argmax(0) # [height, width] + + # Keep track of instances of each class + stuff_memory_list: dict[str, int] = {} + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + should_fuse = pred_class in label_ids_to_fuse + + # Check if mask exists and large enough to be a segment + mask_exists, mask_k = check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if mask_exists: + if pred_class in stuff_memory_list: + current_segment_id = stuff_memory_list[pred_class] + else: + current_segment_id += 1 + + # Add current object segment to final segmentation map + segmentation[mask_k] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "was_fused": should_fuse, + "score": segment_score, + } + ) + if should_fuse: + stuff_memory_list[pred_class] = current_segment_id + + return segmentation, segments + + +# Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. + + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. 
+ """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor") + + scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + +class GroundingDinoImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. Controls whether to normalize the image. Can be overridden by the `do_normalize` + parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. 
Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
+    def __init__(
+        self,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_annotations: Optional[bool] = None,
+        do_pad: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> None:
+        if "pad_and_return_pixel_mask" in kwargs:
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge']` instead.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            max_size = None if size is None else 1333
+
+        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Backwards compatibility
+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
+        super().__init__(**kwargs)
+        self.format = format
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+        self.do_pad = do_pad
+        self.pad_size = pad_size
+        self._valid_processor_keys = [
+            "images",
+            "annotations",
+            "return_segmentation_masks",
+            "masks_path",
+            "do_resize",
+            "size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "do_convert_annotations",
+            "image_mean",
+            "image_std",
+            "do_pad",
+            "pad_size",
+            "format",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    @classmethod
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDino
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+        created using from_dict and kwargs, e.g.
`GroundingDinoImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDino + def prepare_annotation( + self, + image: np.ndarray, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into GroundingDino model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
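+
+        Example (illustrative): with the default `{"shortest_edge": 800, "longest_edge": 1333}` setting, a wide
+        image is capped by `longest_edge` rather than matched to `shortest_edge`.
+
+        >>> import numpy as np
+        >>> processor = GroundingDinoImageProcessor()
+        >>> image = np.zeros((3, 600, 1200), dtype=np.uint8)  # channels-first
+        >>> processor.resize(image, size={"shortest_edge": 800, "longest_edge": 1333}).shape
+        (3, 666, 1333)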
+ """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + # If size is already a dict but missing longest_edge, add it from max_size + if isinstance(size, dict) and "longest_edge" not in size: + size = dict(size) # Make a copy + size["longest_edge"] = max_size + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + new_size = get_resize_output_image_size( + image, + size["shortest_edge"], + size["longest_edge"], + input_data_format=input_data_format, + ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, + size["max_height"], + size["max_width"], + input_data_format=input_data_format, + ) + elif "height" in size and "width" in size: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ """ + return rescale( + image, + rescale_factor, + data_format=data_format, + input_data_format=input_data_format, + ) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format and from absolute to relative pixel values. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = pad( + masks, + padding, + mode=PaddingMode.CONSTANT, + constant_values=0, + input_data_format=ChannelDimension.FIRST, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= np.asarray( + [ + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + ] + ) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. 
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, + (input_height, input_width), + (output_height, output_width), + padding, + update_bboxes, + ) + return padded_image, annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: list[np.ndarray], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + pad_size: Optional[dict[str, int]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + images (list[`np.ndarray`]): + Images to pad. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + update_bboxes (`bool`, *optional*, defaults to `True`): + Whether to update the bounding boxes in the annotations to match the padded images. If the + bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` + format, the bounding boxes will not be updated. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. 
+ """ + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) + + annotation_list = annotations if annotations is not None else [None] * len(images) + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotation_list): + padded_image, padded_annotation = self._pad_image( + image, + padded_size, + annotation, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=update_bboxes, + ) + padded_images.append(padded_image) + padded_annotations.append(padded_annotation) + + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask( + image=image, + output_size=padded_size, + input_data_format=input_data_format, + ) + for image in images + ] + data["pixel_mask"] = masks + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations + ] + + return encoded_inputs + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. 
+            return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+                Whether to return segmentation masks.
+            masks_path (`str` or `pathlib.Path`, *optional*):
+                Path to the directory containing the segmentation masks.
+            do_resize (`bool`, *optional*, defaults to self.do_resize):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to self.size):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      The aspect ratio is NOT preserved.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to the maximum size
+                      that respects the aspect ratio while keeping the shortest edge less than or equal to
+                      `shortest_edge` and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size that
+                      respects the aspect ratio while keeping the height less than or equal to `max_height` and the
+                      width less than or equal to `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to self.resample):
+                Resampling filter to use when resizing the image.
+            do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+                Rescale factor to use when rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+                Whether to normalize the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and from absolute to relative coordinates.
+            image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean):
+                Mean to use when normalizing the image.
+            image_std (`float` or `list[float]`, *optional*, defaults to self.image_std):
+                Standard deviation to use when normalizing the image.
+            do_pad (`bool`, *optional*, defaults to self.do_pad):
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+                Format of the annotations.
+            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+                Type of tensors to return. If `None`, will return the list of images.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + max_size = kwargs.pop("max_size") + if size is None: + size = {"shortest_edge": 800, "longest_edge": max_size} + else: + # If size is already provided, we need to handle max_size appropriately + if isinstance(size, dict) and "longest_edge" not in size: + size = dict(size) # Make a copy to avoid modifying the original + size["longest_edge"] = max_size + # If size already has longest_edge, the max_size is ignored (deprecated behavior) + else: + max_size = None if size is None else 1333 + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_kwargs( + captured_kwargs=kwargs.keys(), + valid_processor_keys=self._valid_processor_keys, + ) + + # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." 
+ ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, + size=size, + resample=resample, + input_data_format=input_data_format, + ) + resized_annotation = self.resize_annotation( + target, + orig_size, + get_image_size(resized_image, input_data_format), + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize( + image, + size=size, + resample=resample, + input_data_format=input_data_format, + ) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + + if do_convert_annotations and annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + encoded_inputs = self.pad( + images, + annotations=annotations, + return_pixel_mask=True, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=do_convert_annotations, + return_tensors=return_tensors, + pad_size=pad_size, + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, 
tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino + def post_process_object_detection( + self, + outputs: "GroundingDinoObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + ): + """ + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`GroundingDinoObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + """ + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) + + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") + + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices + + # Convert to [x0, y0, x1, y1] format + batch_boxes = center_to_corners_format(batch_boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + batch_boxes = _scale_boxes(batch_boxes, target_sizes) + + results = [] + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) + + return results + + +__all__ = ["GroundingDinoImageProcessor"] diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index ef4f4701e7e..32494e67b27 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -71,7 +71,8 @@ def max_across_indices(values: Iterable[Any]) -> list[Any]: # Copied from transformers.models.detr.image_processing_detr.get_max_height_width def get_max_height_width( - images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None + images: list[np.ndarray], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> list[int]: """ Get the maximum height and width across all images in a batch. 
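The Mask2Former hunks here, like the matching ones in the files below, are formatting-only re-wraps of two batching helpers. As a reminder of what those helpers compute, here is a minimal NumPy sketch; the shapes are invented for illustration and the code is not part of this patch:

    import numpy as np

    # Two channels-first images with different spatial sizes
    images = [np.zeros((3, 480, 640)), np.zeros((3, 512, 600))]

    # get_max_height_width: the per-batch maximum of each spatial dimension
    max_height = max(img.shape[1] for img in images)  # 512
    max_width = max(img.shape[2] for img in images)   # 640

    # make_pixel_mask: 1 marks valid pixels, 0 marks the bottom/right padding
    mask = np.zeros((max_height, max_width), dtype=np.int64)
    mask[: images[0].shape[1], : images[0].shape[2]] = 1  # first image: top 480 rows valid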
@@ -90,7 +91,9 @@ def get_max_height_width( # Copied from transformers.models.detr.image_processing_detr.make_pixel_mask def make_pixel_mask( - image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None + image: np.ndarray, + output_size: tuple[int, int], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. @@ -218,7 +221,10 @@ def compute_segments( if target_size is not None: mask_probs = nn.functional.interpolate( - mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + mask_probs.unsqueeze(0), + size=target_size, + mode="bilinear", + align_corners=False, )[0] current_segment_id = 0 @@ -551,7 +557,12 @@ class Mask2FormerImageProcessor(BaseImageProcessor): - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. """ - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + return rescale( + image, + rescale_factor, + data_format=data_format, + input_data_format=input_data_format, + ) # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.convert_segmentation_map_to_binary_masks def convert_segmentation_map_to_binary_masks( diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index 91bdf020f21..bc93db35d05 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -77,7 +77,8 @@ def max_across_indices(values: Iterable[Any]) -> list[Any]: # Copied from transformers.models.detr.image_processing_detr.get_max_height_width def get_max_height_width( - images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None + images: list[np.ndarray], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> list[int]: """ Get the maximum height and width across all images in a batch. @@ -96,7 +97,9 @@ def get_max_height_width( # Copied from transformers.models.detr.image_processing_detr.make_pixel_mask def make_pixel_mask( - image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None + image: np.ndarray, + output_size: tuple[int, int], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. @@ -224,7 +227,10 @@ def compute_segments( if target_size is not None: mask_probs = nn.functional.interpolate( - mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + mask_probs.unsqueeze(0), + size=target_size, + mode="bilinear", + align_corners=False, )[0] current_segment_id = 0 @@ -555,7 +561,12 @@ class MaskFormerImageProcessor(BaseImageProcessor): - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
""" - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + return rescale( + image, + rescale_factor, + data_format=data_format, + input_data_format=input_data_format, + ) def convert_segmentation_map_to_binary_masks( self, diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 4268edb13ce..bc493e7c241 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -74,7 +74,8 @@ def max_across_indices(values: Iterable[Any]) -> list[Any]: # Copied from transformers.models.detr.image_processing_detr.get_max_height_width def get_max_height_width( - images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None + images: list[np.ndarray], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> list[int]: """ Get the maximum height and width across all images in a batch. @@ -93,7 +94,9 @@ def get_max_height_width( # Copied from transformers.models.detr.image_processing_detr.make_pixel_mask def make_pixel_mask( - image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None + image: np.ndarray, + output_size: tuple[int, int], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. @@ -221,7 +224,10 @@ def compute_segments( if target_size is not None: mask_probs = nn.functional.interpolate( - mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + mask_probs.unsqueeze(0), + size=target_size, + mode="bilinear", + align_corners=False, )[0] current_segment_id = 0 @@ -558,7 +564,12 @@ class OneFormerImageProcessor(BaseImageProcessor): - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. """ - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + return rescale( + image, + rescale_factor, + data_format=data_format, + input_data_format=input_data_format, + ) # Copied from transformers.models.maskformer.image_processing_maskformer.MaskFormerImageProcessor.convert_segmentation_map_to_binary_masks def convert_segmentation_map_to_binary_masks( diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 9d9548e69e5..4575ad11575 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -238,7 +238,8 @@ def max_across_indices(values: Iterable[Any]) -> list[Any]: # Copied from transformers.models.detr.image_processing_detr.get_max_height_width def get_max_height_width( - images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None + images: list[np.ndarray], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> list[int]: """ Get the maximum height and width across all images in a batch. 
@@ -257,7 +258,9 @@ def get_max_height_width( # Copied from transformers.models.detr.image_processing_detr.make_pixel_mask def make_pixel_mask( - image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None + image: np.ndarray, + output_size: tuple[int, int], + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. @@ -537,16 +540,26 @@ class RTDetrImageProcessor(BaseImageProcessor): "Please specify in `size['longest_edge'] instead`.", ) max_size = kwargs.pop("max_size") + # If size is already a dict but missing longest_edge, add it from max_size + if isinstance(size, dict) and "longest_edge" not in size: + size = dict(size) # Make a copy + size["longest_edge"] = max_size else: max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: new_size = get_resize_output_image_size( - image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + image, + size["shortest_edge"], + size["longest_edge"], + input_data_format=input_data_format, ) elif "max_height" in size and "max_width" in size: new_size = get_image_size_for_max_height_width( - image, size["max_height"], size["max_width"], input_data_format=input_data_format + image, + size["max_height"], + size["max_width"], + input_data_format=input_data_format, ) elif "height" in size and "width" in size: new_size = (size["height"], size["width"]) @@ -606,7 +619,12 @@ class RTDetrImageProcessor(BaseImageProcessor): - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
""" - return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + return rescale( + image, + rescale_factor, + data_format=data_format, + input_data_format=input_data_format, + ) # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: @@ -690,7 +708,11 @@ class RTDetrImageProcessor(BaseImageProcessor): ) if annotation is not None: annotation = self._update_annotation_for_padded_image( - annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes + annotation, + (input_height, input_width), + (output_height, output_width), + padding, + update_bboxes, ) return padded_image, annotation @@ -766,7 +788,11 @@ class RTDetrImageProcessor(BaseImageProcessor): if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) + make_pixel_mask( + image=image, + output_size=padded_size, + input_data_format=input_data_format, + ) for image in images ] data["pixel_mask"] = masks diff --git a/test_fix_verification.py b/test_fix_verification.py new file mode 100644 index 00000000000..3cfb2d621e1 --- /dev/null +++ b/test_fix_verification.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +def test_all_fixes(): + print("๐Ÿงช Testing all DETR max_size parameter fixes...") + + try: + from transformers.models.conditional_detr.image_processing_conditional_detr import ConditionalDetrImageProcessor + from transformers.models.detr.image_processing_detr import DetrImageProcessor + from transformers.models.deformable_detr.image_processing_deformable_detr import DeformableDetrImageProcessor + + processors = [ + ("ConditionalDetr", ConditionalDetrImageProcessor), + ("Detr", DetrImageProcessor), + ("DeformableDetr", DeformableDetrImageProcessor) + ] + + for name, ProcessorClass in processors: + print(f"\n๐Ÿ”ง Testing {name}ImageProcessor...") + + # Test 1: from_dict with size=42, max_size=84 + processor = ProcessorClass.from_dict({ + "do_resize": True, + "do_normalize": True, + "do_pad": True, + }, size=42, max_size=84) + expected = {"shortest_edge": 42, "longest_edge": 84} + actual = processor.size + assert actual == expected, f"โŒ Test 1 failed: expected {expected}, got {actual}" + print(f"โœ… Test 1 passed: from_dict(size=42, max_size=84) = {actual}") + + # Test 2: from_dict with size dict without longest_edge + max_size + processor = ProcessorClass.from_dict({ + "do_resize": True, + "do_normalize": True, + "do_pad": True, + "size": {"shortest_edge": 100} + }, max_size=200) + expected = {"shortest_edge": 100, "longest_edge": 200} + actual = processor.size + assert actual == expected, f"โŒ Test 2 failed: expected {expected}, got {actual}" + print(f"โœ… Test 2 passed: size without longest_edge + max_size = {actual}") + + # Test 3: init with max_size only + processor = ProcessorClass(max_size=500) + expected = {"shortest_edge": 800, "longest_edge": 500} + actual = processor.size + assert actual == expected, f"โŒ Test 3 failed: expected {expected}, got {actual}" + print(f"โœ… Test 3 passed: init(max_size=500) = {actual}") + + print(f"๐ŸŽ‰ All tests passed for {name}ImageProcessor!") + + print("\n๐ŸŒŸ All DETR image processors work correctly!") + return True + + except Exception as e: + print(f"โŒ Test failed with error: {e}") + import traceback + 
traceback.print_exc() + return False + +if __name__ == "__main__": + success = test_all_fixes() + sys.exit(0 if success else 1) \ No newline at end of file diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index 8612641aa4c..ea133be01e5 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -640,7 +640,7 @@ class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcess expected_size = {"shortest_edge": 500, "longest_edge": 800} self.assertEqual(image_processor.size, expected_size) - # Test 4: from_dict with max_size (using a dict without longest_edge) + # Test 4: from_dict with max_size (using a dict without longest_edge) test_dict = {k: v for k, v in self.image_processor_dict.items() if k != "size"} test_dict["size"] = {"shortest_edge": 18} # Only shortest_edge, no longest_edge image_processor = image_processing_class.from_dict(test_dict, max_size=1100)
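Taken together, the `from_dict` changes give the behavior these tests pin down. A condensed sketch follows, with values mirroring the test cases above rather than introducing new expectations; `ConditionalDetrImageProcessor` is used as the example, and the same holds for the Detr and DeformableDetr processors:

    from transformers import ConditionalDetrImageProcessor

    # Deprecated max_size combined with a size dict lacking longest_edge:
    # the longest edge is filled in from max_size.
    processor = ConditionalDetrImageProcessor.from_dict(
        {"size": {"shortest_edge": 100}}, max_size=200
    )
    assert processor.size == {"shortest_edge": 100, "longest_edge": 200}

    # max_size alone: the default shortest edge of 800 is used.
    processor = ConditionalDetrImageProcessor.from_dict({}, max_size=500)
    assert processor.size == {"shortest_edge": 800, "longest_edge": 500}

    # An integer size plus max_size is normalized to the dict form.
    processor = ConditionalDetrImageProcessor.from_dict({}, size=42, max_size=84)
    assert processor.size == {"shortest_edge": 42, "longest_edge": 84}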