added fast image processor for ZoeDepth and expanded tests accordingly (#38515)

* added fast image processor for ZoeDepth and expanded tests accordingly

* added fast image processor for ZoeDepth and expanded tests accordingly; hopefully this also fixes the repo consistency issue

* final edits for ZoeDepth fast image processor

* final minor edit for ZoeDepth fast image processor
This commit is contained in:
Henrik Matthiesen 2025-06-05 00:59:17 +02:00 committed by GitHub
parent a510be20f3
commit 1fed6166c0
5 changed files with 435 additions and 36 deletions


@@ -119,6 +119,11 @@ Image.fromarray(depth.astype("uint8"))
[[autodoc]] ZoeDepthImageProcessor
- preprocess
## ZoeDepthImageProcessorFast
[[autodoc]] ZoeDepthImageProcessorFast
- preprocess
## ZoeDepthForDepthEstimation
[[autodoc]] ZoeDepthForDepthEstimation
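The new fast processor mirrors the slow one documented above; a minimal usage sketch (not part of the diff), assuming the `Intel/zoedepth-nyu-kitti` checkpoint used elsewhere in the ZoeDepth docs and a PIL `image` variable:

from transformers import ZoeDepthImageProcessorFast

# Illustrative sketch only; checkpoint name and `image` are assumptions.
image_processor = ZoeDepthImageProcessorFast.from_pretrained("Intel/zoedepth-nyu-kitti")
inputs = image_processor(images=image, return_tensors="pt")
print(inputs.pixel_values.shape)  # e.g. torch.Size([1, 3, H, W]) after padding and resizing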


@@ -170,7 +170,7 @@ else:
("vitmatte", ("VitMatteImageProcessor", "VitMatteImageProcessorFast")),
("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")),
("zoedepth", ("ZoeDepthImageProcessor",)),
("zoedepth", ("ZoeDepthImageProcessor", "ZoeDepthImageProcessorFast")),
]
)
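With `ZoeDepthImageProcessorFast` registered in the mapping above, `AutoImageProcessor` can now return the fast variant; a hedged sketch (checkpoint name is an assumption):

from transformers import AutoImageProcessor

# Illustrative sketch only; `use_fast=True` should resolve to the fast class via the updated mapping.
processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti", use_fast=True)
print(type(processor).__name__)  # expected: ZoeDepthImageProcessorFast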


@@ -20,6 +20,7 @@ from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_zoedepth import *
from .image_processing_zoedepth import *
from .image_processing_zoedepth_fast import *
from .modeling_zoedepth import *
else:
import sys


@@ -0,0 +1,328 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for ZoeDepth."""
from typing import (
Dict,
List,
Optional,
Tuple,
Union,
)
import numpy as np
from ...image_processing_utils import (
BatchFeature,
)
from ...image_processing_utils_fast import (
BaseImageProcessorFast,
DefaultFastImageProcessorKwargs,
group_images_by_shape,
reorder_images,
)
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
SizeDict,
get_image_size,
)
from ...processing_utils import Unpack
from ...utils import (
TensorType,
auto_docstring,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
logging,
requires_backends,
)
from .image_processing_zoedepth import get_resize_output_image_size
from .modeling_zoedepth import ZoeDepthDepthEstimatorOutput
if is_torch_available():
import torch
if is_torchvision_available():
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
else:
from torchvision.transforms import functional as F
from torchvision.transforms import InterpolationMode
logger = logging.get_logger(__name__)
class ZoeDepthFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
"""
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the input.
keep_aspect_ratio (`bool`, *optional*, defaults to `True`):
If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it
for both dimensions. This ensures that the image is scaled down as little as possible while still fitting
within the desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a
size that is a multiple of this value by flooring the height and width to the nearest multiple of this value.
Can be overridden by `keep_aspect_ratio` in `preprocess`.
ensure_multiple_of (`int`, *optional*, defaults to 32):
If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring
the height and width to the nearest multiple of this value.
Works both with and without `keep_aspect_ratio` being set to `True`.
Can be overridden by `ensure_multiple_of` in `preprocess`.
"""
do_pad: Optional[bool]
keep_aspect_ratio: Optional[bool]
ensure_multiple_of: Optional[int]
@auto_docstring
class ZoeDepthImageProcessorFast(BaseImageProcessorFast):
do_pad = True
do_rescale = True
do_normalize = True
image_mean = IMAGENET_STANDARD_MEAN
image_std = IMAGENET_STANDARD_STD
do_resize = True
size = {"height": 384, "width": 512}
resample = PILImageResampling.BILINEAR
keep_aspect_ratio = True
ensure_multiple_of = 32
valid_kwargs = ZoeDepthFastImageProcessorKwargs
def __init__(self, **kwargs: Unpack[ZoeDepthFastImageProcessorKwargs]) -> None:
super().__init__(**kwargs)
@auto_docstring
def preprocess(
self,
images: ImageInput,
**kwargs: Unpack[ZoeDepthFastImageProcessorKwargs],
) -> BatchFeature:
return super().preprocess(images, **kwargs)
def resize(
self,
images: "torch.Tensor",
size: SizeDict,
keep_aspect_ratio: bool = False,
ensure_multiple_of: int = 1,
interpolation: Optional["F.InterpolationMode"] = None,
) -> "torch.Tensor":
"""
Resize an image or a batch of images to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image
is resized to the largest possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is
set, the image is resized to a size that is a multiple of this value.
Args:
images (`torch.Tensor`):
Images to resize.
size (`Dict[str, int]`):
Target size of the output image.
keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved.
ensure_multiple_of (`int`, *optional*, defaults to 1):
The image is resized to a size that is a multiple of this value.
interpolation (`F.InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
Defines the resampling filter to use if resizing the image. Otherwise, the image is resized to size
specified in `size`.
"""
if not size.height or not size.width:
raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size}")
output_size = get_resize_output_image_size(
images,
output_size=(size.height, size.width),
keep_aspect_ratio=keep_aspect_ratio,
multiple=ensure_multiple_of,
input_data_format=ChannelDimension.FIRST,
)
height, width = output_size
resized_images = torch.nn.functional.interpolate(
images, (int(height), int(width)), mode=interpolation.value, align_corners=True
)
return resized_images
def _pad_images(
self,
images: "torch.Tensor",
):
"""
Args:
images (`torch.Tensor`):
Images to pad, using reflection padding on each spatial side.
"""
height, width = get_image_size(images, channel_dim=ChannelDimension.FIRST)
pad_height = int(np.sqrt(height / 2) * 3)
pad_width = int(np.sqrt(width / 2) * 3)
return F.pad(images, padding=(pad_width, pad_height), padding_mode="reflect")
def _preprocess(
self,
images: list["torch.Tensor"],
do_resize: bool,
size: SizeDict,
keep_aspect_ratio: Optional[bool],
ensure_multiple_of: Optional[int],
interpolation: Optional["F.InterpolationMode"],
do_pad: bool,
do_rescale: bool,
rescale_factor: Optional[float],
do_normalize: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchFeature:
# Group images by size for batched resizing
grouped_images, grouped_images_index = group_images_by_shape(images)
resized_images_grouped = {}
for shape, stacked_images in grouped_images.items():
if do_rescale:
stacked_images = self.rescale(stacked_images, rescale_factor)
if do_pad:
stacked_images = self._pad_images(images=stacked_images)
if do_resize:
stacked_images = self.resize(
stacked_images, size, keep_aspect_ratio, ensure_multiple_of, interpolation
)
if do_normalize:
stacked_images = self.normalize(stacked_images, image_mean, image_std)
resized_images_grouped[shape] = stacked_images
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
processed_images = torch.stack(resized_images, dim=0) if return_tensors else resized_images
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
def post_process_depth_estimation(
self,
outputs: "ZoeDepthDepthEstimatorOutput",
source_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None,
target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None,
outputs_flipped: Optional[Union["ZoeDepthDepthEstimatorOutput", None]] = None,
do_remove_padding: Optional[Union[bool, None]] = None,
) -> List[Dict[str, TensorType]]:
"""
Converts the raw output of [`ZoeDepthDepthEstimatorOutput`] into final depth predictions and depth PIL images.
Only supports PyTorch.
Args:
outputs ([`ZoeDepthDepthEstimatorOutput`]):
Raw outputs of the model.
source_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the source size
(height, width) of each image in the batch before preprocessing. This argument should be dealt as
"required" unless the user passes `do_remove_padding=False` as input to this function.
target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
(height, width) of each image in the batch. If left to None, predictions will not be resized.
outputs_flipped ([`ZoeDepthDepthEstimatorOutput`], *optional*):
Raw outputs of the model from flipped input (averaged out in the end).
do_remove_padding (`bool`, *optional*):
By default ZoeDepth adds padding equal to `int(np.sqrt(height / 2) * 3)` (and similarly for width) to fix the
boundary artifacts in the output depth map, so we need to remove this padding during post-processing. The
parameter exists here in case the user changed the image preprocessing to not include padding.
Returns:
`List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
predictions.
"""
requires_backends(self, "torch")
predicted_depth = outputs.predicted_depth
if (outputs_flipped is not None) and (predicted_depth.shape != outputs_flipped.predicted_depth.shape):
raise ValueError("Make sure that `outputs` and `outputs_flipped` have the same shape")
if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the predicted depth"
)
if do_remove_padding is None:
do_remove_padding = self.do_pad
if source_sizes is None and do_remove_padding:
raise ValueError(
"Either `source_sizes` should be passed in, or `do_remove_padding` should be set to False"
)
if (source_sizes is not None) and (len(predicted_depth) != len(source_sizes)):
raise ValueError(
"Make sure that you pass in as many source image sizes as the batch dimension of the logits"
)
if outputs_flipped is not None:
predicted_depth = (predicted_depth + torch.flip(outputs_flipped.predicted_depth, dims=[-1])) / 2
predicted_depth = predicted_depth.unsqueeze(1)
# Zoe Depth model adds padding around the images to fix the boundary artifacts in the output depth map
# The padding length is `int(np.sqrt(img_h/2) * fh)` for the height and similar for the width
# fh (and fw respectively) are equal to '3' by default
# Check [here](https://github.com/isl-org/ZoeDepth/blob/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/depth_model.py#L57)
# for the original implementation.
# In this section, we remove this padding to get the final depth image and depth prediction
padding_factor_h = padding_factor_w = 3
results = []
target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes
source_sizes = [None] * len(predicted_depth) if source_sizes is None else source_sizes
for depth, target_size, source_size in zip(predicted_depth, target_sizes, source_sizes):
# depth.shape = [1, H, W]
if source_size is not None:
pad_h = pad_w = 0
if do_remove_padding:
pad_h = int(np.sqrt(source_size[0] / 2) * padding_factor_h)
pad_w = int(np.sqrt(source_size[1] / 2) * padding_factor_w)
depth = F.resize(
depth,
size=[source_size[0] + 2 * pad_h, source_size[1] + 2 * pad_w],
interpolation=InterpolationMode.BICUBIC,
antialias=False,
)
if pad_h > 0:
depth = depth[:, pad_h:-pad_h, :]
if pad_w > 0:
depth = depth[:, :, pad_w:-pad_w]
if target_size is not None:
target_size = [target_size[0], target_size[1]]
depth = F.resize(
depth,
size=target_size,
interpolation=InterpolationMode.BICUBIC,
antialias=False,
)
depth = depth.squeeze(0)
# depth.shape = [H, W]
results.append({"predicted_depth": depth})
return results
__all__ = ["ZoeDepthImageProcessorFast"]
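Taken together, `_preprocess` above rescales, reflect-pads (by `int(np.sqrt(dim / 2) * 3)` per side), resizes and normalizes, and `post_process_depth_estimation` undoes the padding and optionally resizes to a target size. A hedged end-to-end sketch, assuming the `Intel/zoedepth-nyu-kitti` checkpoint and a PIL `image`:

import torch
from transformers import ZoeDepthForDepthEstimation, ZoeDepthImageProcessorFast

# Illustrative sketch only; checkpoint name and `image` are assumptions.
processor = ZoeDepthImageProcessorFast.from_pretrained("Intel/zoedepth-nyu-kitti")
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# `source_sizes` (original image sizes) are needed so the reflect padding can be removed:
# e.g. for a 384x512 input, int(np.sqrt(384 / 2) * 3) = 41 rows and int(np.sqrt(512 / 2) * 3) = 48
# columns were added on each side during preprocessing.
post = processor.post_process_depth_estimation(outputs, source_sizes=[(image.height, image.width)])
predicted_depth = post[0]["predicted_depth"]  # torch.Tensor of shape (height, width)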


@@ -14,18 +14,30 @@
import unittest
from dataclasses import dataclass
import numpy as np
from transformers.file_utils import is_vision_available
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from transformers import ZoeDepthImageProcessor
if is_torchvision_available():
from transformers import ZoeDepthImageProcessorFast
@dataclass
class ZoeDepthDepthOutputProxy:
predicted_depth: torch.FloatTensor = None
class ZoeDepthImageProcessingTester:
def __init__(
@@ -43,7 +55,7 @@ class ZoeDepthImageProcessingTester:
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_pad=False,
do_pad=True,
):
size = size if size is not None else {"height": 18, "width": 18}
self.parent = parent
@@ -87,11 +99,25 @@ class ZoeDepthImageProcessingTester:
torchify=torchify,
)
def prepare_depth_outputs(self):
depth_tensors = prepare_image_inputs(
batch_size=self.batch_size,
num_channels=1,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=True,
torchify=True,
)
depth_tensors = [depth_tensor.squeeze(0) for depth_tensor in depth_tensors]
stacked_depth_tensors = torch.stack(depth_tensors, dim=0)
return ZoeDepthDepthOutputProxy(predicted_depth=stacked_depth_tensors)
@require_torch
@require_vision
class ZoeDepthImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = ZoeDepthImageProcessor if is_vision_available() else None
fast_image_processing_class = ZoeDepthImageProcessorFast if is_torchvision_available() else None
def setUp(self):
super().setUp()
@@ -115,10 +141,14 @@ class ZoeDepthImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
self.assertTrue(hasattr(image_processing, "do_pad"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
for image_processing_class in self.image_processor_list:
image_processor = image_processing_class(**self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42)
for image_processing_class in self.image_processor_list:
modified_dict = self.image_processor_dict
modified_dict["size"] = 42
image_processor = image_processing_class(**modified_dict)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
def test_ensure_multiple_of(self):
@@ -127,7 +157,8 @@ class ZoeDepthImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
size = {"height": 380, "width": 513}
multiple = 32
image_processor = ZoeDepthImageProcessor(
for image_processor_class in self.image_processor_list:
image_processor = image_processor_class(
do_pad=False, ensure_multiple_of=multiple, size=size, keep_aspect_ratio=False
)
pixel_values = image_processor(image, return_tensors="pt").pixel_values
@@ -142,7 +173,8 @@ class ZoeDepthImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
height, width = 512, 512
size = {"height": height, "width": width}
multiple = 32
image_processor = ZoeDepthImageProcessor(
for image_processor_class in self.image_processor_list:
image_processor = image_processor_class(
do_pad=False, ensure_multiple_of=multiple, size=size, keep_aspect_ratio=False
)
pixel_values = image_processor(image, return_tensors="pt").pixel_values
@@ -157,14 +189,18 @@ class ZoeDepthImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image = np.zeros((height, width, 3))
size = {"height": 512, "width": 512}
image_processor = ZoeDepthImageProcessor(do_pad=False, keep_aspect_ratio=True, size=size, ensure_multiple_of=1)
for image_processor_class in self.image_processor_list:
image_processor = image_processor_class(
do_pad=False, keep_aspect_ratio=True, size=size, ensure_multiple_of=1
)
pixel_values = image_processor(image, return_tensors="pt").pixel_values
# As can be seen, the image is resized to the maximum size that fits in the specified size
self.assertEqual(list(pixel_values.shape), [1, 3, 512, 670])
# Test `keep_aspect_ratio=False` by turning off all other variables which affect the size
image_processor = ZoeDepthImageProcessor(
for image_processor_class in self.image_processor_list:
image_processor = image_processor_class(
do_pad=False, keep_aspect_ratio=False, size=size, ensure_multiple_of=1
)
pixel_values = image_processor(image, return_tensors="pt").pixel_values
@@ -177,10 +213,39 @@ class ZoeDepthImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
size = {"height": 511, "width": 511}
multiple = 32
image_processor = ZoeDepthImageProcessor(size=size, keep_aspect_ratio=True, ensure_multiple_of=multiple)
for image_processor_class in self.image_processor_list:
image_processor = image_processor_class(size=size, keep_aspect_ratio=True, ensure_multiple_of=multiple)
pixel_values = image_processor(image, return_tensors="pt").pixel_values
self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672])
self.assertTrue(pixel_values.shape[2] % multiple == 0)
self.assertTrue(pixel_values.shape[3] % multiple == 0)
# extend this test to check if removal of padding works fine!
def test_post_processing_equivalence(self):
outputs = self.image_processor_tester.prepare_depth_outputs()
image_processor_fast = self.fast_image_processing_class(**self.image_processor_dict)
image_processor_slow = self.image_processing_class(**self.image_processor_dict)
source_sizes = [outputs.predicted_depth.shape[1:]] * self.image_processor_tester.batch_size
target_sizes = [
torch.Size([outputs.predicted_depth.shape[1] // 2, *(outputs.predicted_depth.shape[2:])])
] * self.image_processor_tester.batch_size
processed_fast = image_processor_fast.post_process_depth_estimation(
outputs,
source_sizes=source_sizes,
target_sizes=target_sizes,
)
processed_slow = image_processor_slow.post_process_depth_estimation(
outputs,
source_sizes=source_sizes,
target_sizes=target_sizes,
)
for pred_fast, pred_slow in zip(processed_fast, processed_slow):
depth_fast = pred_fast["predicted_depth"]
depth_slow = pred_slow["predicted_depth"]
torch.testing.assert_close(depth_fast, depth_slow, atol=1e-1, rtol=1e-3)
self.assertLessEqual(torch.mean(torch.abs(depth_fast.float() - depth_slow.float())).item(), 5e-3)
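The same fast/slow comparison can be reproduced outside the test harness; a minimal sketch, with tolerances borrowed from the equivalence test above and `image` assumed to be any PIL image or NumPy array:

import torch
from transformers import ZoeDepthImageProcessor, ZoeDepthImageProcessorFast

# Illustrative sketch only; both processors are instantiated with their default configuration.
slow_processor = ZoeDepthImageProcessor()
fast_processor = ZoeDepthImageProcessorFast()

pixel_values_slow = slow_processor(image, return_tensors="pt").pixel_values
pixel_values_fast = fast_processor(image, return_tensors="pt").pixel_values

# The fast path resizes with torch.nn.functional.interpolate rather than PIL, so small
# numerical differences versus the slow path are expected and tolerated.
torch.testing.assert_close(pixel_values_fast, pixel_values_slow, atol=1e-1, rtol=1e-3)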