Add Fast Grounding-Dino Processor (#37108)

* Add Fast Grounding-Dino Processor

* Added modular file
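
A minimal usage sketch of the new fast image processor (the checkpoint name, image URL, and threshold below are illustrative assumptions; the fast path requires torch and torchvision):

import requests
from PIL import Image
from transformers import AutoImageProcessor

# With the updated auto mapping, use_fast=True should resolve to GroundingDinoImageProcessorFast.
image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny", use_fast=True)

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # resized, normalized and padded batch
print(inputs["pixel_mask"].shape)    # 1 for valid pixels, 0 for padding

# After a forward pass of GroundingDinoForObjectDetection, boxes can be mapped back to the
# original (height, width) with the image processor's post-processing:
# results = image_processor.post_process_object_detection(
#     outputs, threshold=0.3, target_sizes=[image.size[::-1]]
# )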

---------

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Parteek 2025-04-16 15:56:08 +05:30 committed by GitHub
parent ed53809ac5
commit 6fd87d1172
6 changed files with 976 additions and 13 deletions

View File

@@ -102,6 +102,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] GroundingDinoImageProcessor
- preprocess
## GroundingDinoImageProcessorFast
[[autodoc]] GroundingDinoImageProcessorFast
- preprocess
- post_process_object_detection
## GroundingDinoProcessor

View File

@@ -91,7 +91,7 @@ else:
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("glpn", ("GLPNImageProcessor",)),
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
("grounding-dino", ("GroundingDinoImageProcessor",)),
("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("hiera", ("BitImageProcessor", "BitImageProcessorFast")),
("idefics", ("IdeficsImageProcessor",)),

View File

@@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_grounding_dino import *
from .image_processing_grounding_dino import *
from .image_processing_grounding_dino_fast import *
from .modeling_grounding_dino import *
from .processing_grounding_dino import *
else:

View File

@@ -0,0 +1,825 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/grounding_dino/modular_grounding_dino.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_grounding_dino.py file directly. One of our CI checks enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import pathlib
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_processing_utils_fast import (
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
SizeDict,
get_image_size_for_max_height_width,
get_max_height_width,
safe_squeeze,
)
from ...image_transforms import center_to_corners_format, corners_to_center_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
AnnotationFormat,
AnnotationType,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
validate_annotations,
)
from ...processing_utils import Unpack
from ...utils import (
TensorType,
add_start_docstrings,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
logging,
)
from ...utils.import_utils import requires
from .image_processing_grounding_dino import get_size_with_aspect_ratio
if TYPE_CHECKING:
from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
if is_torch_available():
import torch
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F
logger = logging.get_logger(__name__)
class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
format: Optional[Union[str, AnnotationFormat]]
do_convert_annotations: Optional[bool]
do_pad: Optional[bool]
pad_size: Optional[Dict[str, int]]
return_segmentation_masks: Optional[bool]
SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
# inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools is not installed in your environment.")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
mask = torch.any(mask, axis=2)
masks.append(mask)
if masks:
masks = torch.stack(masks, axis=0)
else:
masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device)
return masks
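# Illustrative (values assumed): for two annotations whose "segmentation" entries are COCO
# polygon lists on a 480x640 image, the result stacks to shape (2, 480, 640), one mask per
# annotation; with no annotations an empty (0, 480, 640) tensor is returned.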
# inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L50
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by GROUNDING_DINO.
"""
image_height, image_width = image.size()[-2:]
image_id = target["image_id"]
image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
# Get all COCO annotations for the given image.
annotations = target["annotations"]
classes = []
area = []
boxes = []
keypoints = []
for obj in annotations:
if "iscrowd" not in obj or obj["iscrowd"] == 0:
classes.append(obj["category_id"])
area.append(obj["area"])
boxes.append(obj["bbox"])
if "keypoints" in obj:
keypoints.append(obj["keypoints"])
classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
# guard against no boxes via resizing
boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {
"image_id": image_id,
"class_labels": classes[keep],
"boxes": boxes[keep],
"area": area[keep],
"iscrowd": iscrowd[keep],
"orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
}
if keypoints:
keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
# Apply the keep mask here to filter the relevant annotations
keypoints = keypoints[keep]
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
new_target["masks"] = masks[keep]
return new_target
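# Illustrative (values assumed): a COCO entry with "bbox": [100.0, 120.0, 80.0, 50.0]
# (x, y, width, height) becomes the corner box [100.0, 120.0, 180.0, 170.0], clipped to the
# image bounds; boxes that collapse after clipping are dropped by the `keep` mask.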
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]`, where `number_masks` is the number of masks
Returns:
boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
"""
if masks.numel() == 0:
return torch.zeros((0, 4), device=masks.device)
h, w = masks.shape[-2:]
y = torch.arange(0, h, dtype=torch.float32, device=masks.device)
x = torch.arange(0, w, dtype=torch.float32, device=masks.device)
# see https://github.com/pytorch/pytorch/issues/50276
y, x = torch.meshgrid(y, x, indexing="ij")
x_mask = masks * torch.unsqueeze(x, 0)
x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0]
x_min = (
torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
)
y_mask = masks * torch.unsqueeze(y, 0)
y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0]
y_min = (
torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0]
)
return torch.stack([x_min, y_min, x_max, y_max], 1)
# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
# Copyright (c) 2018, Alexander Kirillov
# All rights reserved.
def rgb_to_id(color):
"""
Converts RGB color to unique ID.
"""
if isinstance(color, torch.Tensor) and len(color.shape) == 3:
if color.dtype == torch.uint8:
color = color.to(torch.int32)
return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
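# Illustrative: the panoptic color (10, 1, 0) maps to segment id 10 + 256 * 1 + 65536 * 0 = 266,
# matching the encoding used by the panopticapi utilities.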
def prepare_coco_panoptic_annotation(
image: torch.Tensor,
target: Dict,
masks_path: Union[str, pathlib.Path],
return_masks: bool = True,
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for GROUNDING_DINO.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = torch.as_tensor(
[target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device
)
new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device)
if "segments_info" in target:
masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device)
masks = rgb_to_id(masks)
ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device)
masks = masks == ids[:, None, None]
masks = masks.to(torch.bool)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = torch.as_tensor(
[segment_info["category_id"] for segment_info in target["segments_info"]],
dtype=torch.int64,
device=image.device,
)
new_target["iscrowd"] = torch.as_tensor(
[segment_info["iscrowd"] for segment_info in target["segments_info"]],
dtype=torch.int64,
device=image.device,
)
new_target["area"] = torch.as_tensor(
[segment_info["area"] for segment_info in target["segments_info"]],
dtype=torch.float32,
device=image.device,
)
return new_target
def _scale_boxes(boxes, target_sizes):
"""
Scale batch of bounding boxes to the target sizes.
Args:
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
Returns:
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
"""
if isinstance(target_sizes, (list, tuple)):
image_height = torch.tensor([i[0] for i in target_sizes])
image_width = torch.tensor([i[1] for i in target_sizes])
elif isinstance(target_sizes, torch.Tensor):
image_height, image_width = target_sizes.unbind(1)
else:
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
boxes = boxes * scale_factor
return boxes
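# Illustrative (sizes assumed): with target_sizes=[(480, 640)] the scale factor is
# (640, 480, 640, 480), so a normalized corner box [0.25, 0.5, 0.75, 1.0] becomes
# [160.0, 240.0, 480.0, 480.0] in absolute pixel coordinates.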
@add_start_docstrings(
"Constructs a fast GroundingDino image processor.",
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
"""
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
""",
)
@requires(backends=("torchvision", "torch"))
class GroundingDinoImageProcessorFast(BaseImageProcessorFast):
resample = PILImageResampling.BILINEAR
image_mean = IMAGENET_DEFAULT_MEAN
image_std = IMAGENET_DEFAULT_STD
format = AnnotationFormat.COCO_DETECTION
do_resize = True
do_rescale = True
do_normalize = True
do_pad = True
size = {"shortest_edge": 800, "longest_edge": 1333}
default_to_square = False
model_input_names = ["pixel_values", "pixel_mask"]
valid_kwargs = GroundingDinoFastImageProcessorKwargs
def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None:
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
size = kwargs.pop("size", None)
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` parameter is deprecated and will be removed in a future version. "
"Please specify in `size['longest_edge']` instead.",
)
max_size = kwargs.pop("max_size")
else:
max_size = None if size is None else 1333
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
self.size = get_size_dict(size, max_size=max_size, default_to_square=False)
# Backwards compatibility
do_convert_annotations = kwargs.get("do_convert_annotations", None)
do_normalize = kwargs.get("do_normalize", None)
if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
super().__init__(**kwargs)
@classmethod
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
created using from_dict and kwargs e.g. `GroundingDinoImageProcessorFast.from_pretrained(checkpoint, size=600,
max_size=800)`
"""
image_processor_dict = image_processor_dict.copy()
if "max_size" in kwargs:
image_processor_dict["max_size"] = kwargs.pop("max_size")
if "pad_and_return_pixel_mask" in kwargs:
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
return super().from_dict(image_processor_dict, **kwargs)
def prepare_annotation(
self,
image: torch.Tensor,
target: Dict,
format: Optional[AnnotationFormat] = None,
return_segmentation_masks: Optional[bool] = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into GROUNDING_DINO model.
"""
format = format if format is not None else self.format
if format == AnnotationFormat.COCO_DETECTION:
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_detection_annotation(
image, target, return_segmentation_masks, input_data_format=input_data_format
)
elif format == AnnotationFormat.COCO_PANOPTIC:
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_panoptic_annotation(
image,
target,
masks_path=masks_path,
return_masks=return_segmentation_masks,
input_data_format=input_data_format,
)
else:
raise ValueError(f"Format {format} is not supported.")
return target
def resize(
self,
image: torch.Tensor,
size: SizeDict,
interpolation: "F.InterpolationMode" = None,
**kwargs,
) -> torch.Tensor:
"""
Resize the image to the given size. The target dimensions are derived from the `size` dictionary
as described below.
Args:
image (`torch.Tensor`):
Image to resize.
size (`SizeDict`):
Size of the image's `(height, width)` dimensions after resizing. Available options are:
- `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
Do NOT keep the aspect ratio.
- `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
less or equal to `longest_edge`.
- `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
`max_width`.
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
Resampling filter to use if resizing the image.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
if size.shortest_edge and size.longest_edge:
# Resize the image so that the shortest edge or the longest edge is of the given size
# while maintaining the aspect ratio of the original image.
new_size = get_size_with_aspect_ratio(
image.size()[-2:],
size["shortest_edge"],
size["longest_edge"],
)
elif size.max_height and size.max_width:
new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
elif size.height and size.width:
new_size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys, 'shortest_edge' and 'longest_edge' keys, or"
f" 'max_height' and 'max_width' keys. Got {size.keys()}."
)
image = F.resize(
image,
size=new_size,
interpolation=interpolation,
**kwargs,
)
return image
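# Illustrative (sizes assumed): with the default {"shortest_edge": 800, "longest_edge": 1333},
# a 480x640 image resizes to 800x1066 (the short side reaches 800 first), while a 400x1600
# image is capped by the long side and resizes to 333x1333.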
def resize_annotation(
self,
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
interpolation: "F.InterpolationMode" = None,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.NEAREST`):
The resampling filter to use when resizing the masks.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST
ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * torch.as_tensor(
[ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
masks = torch.stack(masks).to(torch.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= torch.as_tensor(
[image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
def _update_annotation_for_padded_image(
self,
annotation: Dict,
input_image_size: Tuple[int, int],
output_image_size: Tuple[int, int],
padding,
update_bboxes,
) -> Dict:
"""
Update the annotation for a padded image.
"""
new_annotation = {}
new_annotation["size"] = output_image_size
ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
for key, value in annotation.items():
if key == "masks":
masks = value
masks = F.pad(
masks,
padding,
fill=0,
)
masks = safe_squeeze(masks, 1)
new_annotation["masks"] = masks
elif key == "boxes" and update_bboxes:
boxes = value
boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
new_annotation["boxes"] = boxes
elif key == "size":
new_annotation["size"] = output_image_size
else:
new_annotation[key] = value
return new_annotation
def pad(
self,
image: torch.Tensor,
padded_size: Tuple[int, int],
annotation: Optional[Dict[str, Any]] = None,
update_bboxes: bool = True,
fill: int = 0,
):
original_size = image.size()[-2:]
padding_bottom = padded_size[0] - original_size[0]
padding_right = padded_size[1] - original_size[1]
if padding_bottom < 0 or padding_right < 0:
raise ValueError(
f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
f"original size. Got padded size: {padded_size}, original size: {original_size}."
)
if original_size != padded_size:
padding = [0, 0, padding_right, padding_bottom]
image = F.pad(image, padding, fill=fill)
if annotation is not None:
annotation = self._update_annotation_for_padded_image(
annotation, original_size, padded_size, padding, update_bboxes
)
# Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
pixel_mask[: original_size[0], : original_size[1]] = 1
return image, pixel_mask, annotation
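# Illustrative (sizes assumed): padding a (3, 480, 640) image to padded_size=(512, 704) adds
# 32 rows at the bottom and 64 columns on the right; the returned pixel_mask is 1 over the
# original 480x640 region and 0 over the padded strips.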
@add_start_docstrings(
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
"""
annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
List of annotations associated with the image or batch of images. If annotation is for object
detection, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
dictionary. An image can have no annotations, in which case the list should be empty.
If annotation is for segmentation, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
An image can have no segments, in which case the list should be empty.
- "file_name" (`str`): The file name of the image.
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
method. If `True`, padding will be applied to the bottom and right of the image with zeros.
If `pad_size` is provided, the image will be padded to the specified dimensions.
Otherwise, the image will be padded to the maximum height and width of the batch.
pad_size (`Dict[str, int]`, *optional*):
The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
height and width in the batch.
return_segmentation_masks (`bool`, *optional*, defaults to `False`):
Whether to return segmentation masks.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
""",
)
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
**kwargs: Unpack[GroundingDinoFastImageProcessorKwargs],
) -> BatchFeature:
if "pad_and_return_pixel_mask" in kwargs:
kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask")
logger.warning_once(
"The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
"use `do_pad` instead."
)
if "max_size" in kwargs:
logger.warning_once(
"The `max_size` argument is deprecated and will be removed in a future version, use"
" `size['longest_edge']` instead."
)
kwargs["size"] = kwargs.pop("max_size")
return super().preprocess(images, annotations=annotations, masks_path=masks_path, **kwargs)
def _preprocess(
self,
images: List["torch.Tensor"],
annotations: Optional[Union[AnnotationType, List[AnnotationType]]],
return_segmentation_masks: bool,
masks_path: Optional[Union[str, pathlib.Path]],
do_resize: bool,
size: SizeDict,
interpolation: Optional["F.InterpolationMode"],
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
do_convert_annotations: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
do_pad: bool,
pad_size: Optional[Dict[str, int]],
format: Optional[Union[str, AnnotationFormat]],
return_tensors: Optional[Union[str, TensorType]],
) -> BatchFeature:
"""
Preprocess an image or a batch of images so that it can be used by the model.
"""
if annotations is not None and isinstance(annotations, dict):
annotations = [annotations]
if annotations is not None and len(images) != len(annotations):
raise ValueError(
f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
)
format = AnnotationFormat(format)
if annotations is not None:
validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
if (
masks_path is not None
and format == AnnotationFormat.COCO_PANOPTIC
and not isinstance(masks_path, (pathlib.Path, str))
):
raise ValueError(
"The path to the directory containing the mask PNG files should be provided as a"
f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
)
data = {}
processed_images = []
processed_annotations = []
pixel_masks = [] # Initialize pixel_masks here
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
# prepare (COCO annotations as a list of Dict -> GROUNDING_DINO target as a single Dict per image)
if annotations is not None:
annotation = self.prepare_annotation(
image,
annotation,
format,
return_segmentation_masks=return_segmentation_masks,
masks_path=masks_path,
input_data_format=ChannelDimension.FIRST,
)
if do_resize:
resized_image = self.resize(image, size=size, interpolation=interpolation)
if annotations is not None:
annotation = self.resize_annotation(
annotation,
orig_size=image.size()[-2:],
target_size=resized_image.size()[-2:],
)
image = resized_image
# Fused rescale and normalize
image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
if do_convert_annotations and annotations is not None:
annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
processed_images.append(image)
processed_annotations.append(annotation)
images = processed_images
annotations = processed_annotations if annotations is not None else None
if do_pad:
# depends on all resized image shapes so we need another loop
if pad_size is not None:
padded_size = (pad_size["height"], pad_size["width"])
else:
padded_size = get_max_height_width(images)
padded_images = []
padded_annotations = []
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
if padded_size == image.size()[-2:]:
padded_images.append(image)
pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
padded_annotations.append(annotation)
continue
image, pixel_mask, annotation = self.pad(
image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
)
padded_images.append(image)
padded_annotations.append(annotation)
pixel_masks.append(pixel_mask)
images = padded_images
annotations = padded_annotations if annotations is not None else None
data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
data.update({"pixel_values": torch.stack(images, dim=0)})
encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
return encoded_inputs
def post_process_object_detection(
self,
outputs: "GroundingDinoObjectDetectionOutput",
threshold: float = 0.1,
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
):
"""
Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format.
Args:
outputs ([`GroundingDinoObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
- "scores": The confidence scores for each predicted box on the image.
- "labels": Indexes of the classes predicted by the model on the image.
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
"""
batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
batch_size = len(batch_logits)
if target_sizes is not None and len(target_sizes) != batch_size:
raise ValueError("Make sure that you pass in as many target sizes as images")
# batch_logits of shape (batch_size, num_queries, num_classes)
batch_class_logits = torch.max(batch_logits, dim=-1)
batch_scores = torch.sigmoid(batch_class_logits.values)
batch_labels = batch_class_logits.indices
# Convert to [x0, y0, x1, y1] format
batch_boxes = center_to_corners_format(batch_boxes)
# Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None:
batch_boxes = _scale_boxes(batch_boxes, target_sizes)
results = []
for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
keep = scores > threshold
scores = scores[keep]
labels = labels[keep]
boxes = boxes[keep]
results.append({"scores": scores, "labels": labels, "boxes": boxes})
return results
__all__ = ["GroundingDinoImageProcessorFast"]
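
A short sketch of the annotation path defined above (the image tensor and the COCO-style annotation are made up; the output keys follow `_preprocess`):

import torch
from transformers import GroundingDinoImageProcessorFast

image = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
annotation = {
    "image_id": 0,
    "annotations": [{"category_id": 1, "bbox": [100.0, 120.0, 80.0, 50.0], "area": 4000.0, "iscrowd": 0}],
}

image_processor = GroundingDinoImageProcessorFast()
encoding = image_processor(images=image, annotations=annotation, return_tensors="pt")

print(encoding["pixel_values"].shape, encoding["pixel_mask"].shape)
# With do_convert_annotations=True (the default), boxes come back normalized to [0, 1] in
# (center_x, center_y, width, height) format:
print(encoding["labels"][0]["boxes"])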

View File

@@ -0,0 +1,127 @@
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast
from ...image_transforms import center_to_corners_format
from ...utils import (
TensorType,
is_torch_available,
logging,
)
if TYPE_CHECKING:
from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
def _scale_boxes(boxes, target_sizes):
"""
Scale batch of bounding boxes to the target sizes.
Args:
boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
target_sizes (`List[Tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.
Returns:
`torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
"""
if isinstance(target_sizes, (list, tuple)):
image_height = torch.tensor([i[0] for i in target_sizes])
image_width = torch.tensor([i[1] for i in target_sizes])
elif isinstance(target_sizes, torch.Tensor):
image_height, image_width = target_sizes.unbind(1)
else:
raise ValueError("`target_sizes` must be a list, tuple or torch.Tensor")
scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1)
scale_factor = scale_factor.unsqueeze(1).to(boxes.device)
boxes = boxes * scale_factor
return boxes
class GroundingDinoImageProcessorFast(DetrImageProcessorFast):
def post_process_object_detection(
self,
outputs: "GroundingDinoObjectDetectionOutput",
threshold: float = 0.1,
target_sizes: Optional[Union[TensorType, List[Tuple]]] = None,
):
"""
Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format.
Args:
outputs ([`GroundingDinoObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the following keys:
- "scores": The confidence scores for each predicted box on the image.
- "labels": Indexes of the classes predicted by the model on the image.
- "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
"""
batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
batch_size = len(batch_logits)
if target_sizes is not None and len(target_sizes) != batch_size:
raise ValueError("Make sure that you pass in as many target sizes as images")
# batch_logits of shape (batch_size, num_queries, num_classes)
batch_class_logits = torch.max(batch_logits, dim=-1)
batch_scores = torch.sigmoid(batch_class_logits.values)
batch_labels = batch_class_logits.indices
# Convert to [x0, y0, x1, y1] format
batch_boxes = center_to_corners_format(batch_boxes)
# Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None:
batch_boxes = _scale_boxes(batch_boxes, target_sizes)
results = []
for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
keep = scores > threshold
scores = scores[keep]
labels = labels[keep]
boxes = boxes[keep]
results.append({"scores": scores, "labels": labels, "boxes": boxes})
return results
def post_process(self, *args, **kwargs):
raise NotImplementedError("Post-processing is not implemented for Grounding-Dino yet.")
def post_process_segmentation(self, *args, **kwargs):
raise NotImplementedError("Segmentation post-processing is not implemented for Grounding-Dino yet.")
def post_process_instance(self, *args, **kwargs):
raise NotImplementedError("Instance post-processing is not implemented for Grounding-Dino yet.")
def post_process_panoptic(self, *args, **kwargs):
raise NotImplementedError("Panoptic post-processing is not implemented for Grounding-Dino yet.")
def post_process_instance_segmentation(self, *args, **kwargs):
raise NotImplementedError("Instance segmentation post-processing is not implemented for Grounding-Dino yet.")
def post_process_semantic_segmentation(self, *args, **kwargs):
raise NotImplementedError("Semantic segmentation post-processing is not implemented for Grounding-Dino yet.")
def post_process_panoptic_segmentation(self, *args, **kwargs):
raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Grounding-Dino yet.")
__all__ = ["GroundingDinoImageProcessorFast"]
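
A quick way to sanity-check the modular shortcut against the existing slow processor (illustrative only; the random input image is an assumption, and exact equality is not expected because the resampling backends differ):

import torch
from transformers import GroundingDinoImageProcessor, GroundingDinoImageProcessorFast

image = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)

slow_processor = GroundingDinoImageProcessor()
fast_processor = GroundingDinoImageProcessorFast()

slow_out = slow_processor(images=image, return_tensors="pt")
fast_out = fast_processor(images=image, return_tensors="pt")

# PIL vs torchvision resampling introduces small numerical differences, so compare
# statistics rather than asserting exact equality.
diff = (slow_out["pixel_values"] - fast_out["pixel_values"]).abs()
print(diff.mean().item(), diff.max().item())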

View File

@@ -20,7 +20,7 @@ import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
@@ -35,6 +35,9 @@ if is_vision_available():
from transformers import GroundingDinoImageProcessor
if is_torchvision_available():
from transformers import GroundingDinoImageProcessorFast
class GroundingDinoImageProcessingTester:
def __init__(
@@ -147,6 +150,7 @@ class GroundingDinoImageProcessingTester:
@require_vision
class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None
fast_image_processing_class = GroundingDinoImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
@@ -182,20 +186,21 @@ class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessin
self.assertEqual(image_processor.do_pad, False)
def test_post_process_object_detection(self):
image_processor = self.image_processing_class(**self.image_processor_dict)
outputs = self.image_processor_tester.get_fake_grounding_dino_output()
results = image_processor.post_process_object_detection(outputs, threshold=0.0)
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
outputs = self.image_processor_tester.get_fake_grounding_dino_output()
results = image_processor.post_process_object_detection(outputs, threshold=0.0)
self.assertEqual(len(results), self.image_processor_tester.batch_size)
self.assertEqual(list(results[0].keys()), ["scores", "labels", "boxes"])
self.assertEqual(results[0]["boxes"].shape, (self.image_processor_tester.num_queries, 4))
self.assertEqual(results[0]["scores"].shape, (self.image_processor_tester.num_queries,))
self.assertEqual(len(results), self.image_processor_tester.batch_size)
self.assertEqual(list(results[0].keys()), ["scores", "labels", "boxes"])
self.assertEqual(results[0]["boxes"].shape, (self.image_processor_tester.num_queries, 4))
self.assertEqual(results[0]["scores"].shape, (self.image_processor_tester.num_queries,))
expected_scores = torch.tensor([0.7050, 0.7222, 0.7222, 0.6829, 0.7220])
torch.testing.assert_close(results[0]["scores"], expected_scores, rtol=1e-4, atol=1e-4)
expected_scores = torch.tensor([0.7050, 0.7222, 0.7222, 0.6829, 0.7220])
torch.testing.assert_close(results[0]["scores"], expected_scores, rtol=1e-4, atol=1e-4)
expected_box_slice = torch.tensor([0.6908, 0.4354, 1.0737, 1.3947])
torch.testing.assert_close(results[0]["boxes"][0], expected_box_slice, rtol=1e-4, atol=1e-4)
expected_box_slice = torch.tensor([0.6908, 0.4354, 1.0737, 1.3947])
torch.testing.assert_close(results[0]["boxes"][0], expected_box_slice, rtol=1e-4, atol=1e-4)
@slow
# Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_detection_annotations with DeformableDetr->GroundingDino