Add padding image transformation (#19838)

* Add padding transformation * Add in upstream changes * Update tests & docs * Code formatting tuples in docstring
2025-08-01 18:51:14 +06:00 · 2022-11-18 11:27:21 +00:00 · 2022-11-18 11:27:21 +00:00 · b98269425e
commit b98269425e
parent c29a2f7c9c
3 changed files with 225 additions and 1 deletions
--- a/docs/source/en/internal/image_processing_utils.mdx
+++ b/docs/source/en/internal/image_processing_utils.mdx
@ -29,6 +29,8 @@ Most of those are only useful if you are studying the code of the image processo
 [[autodoc]] image_transforms.normalize
 [[autodoc]] image_transforms.pad
 [[autodoc]] image_transforms.rgb_to_id
 [[autodoc]] image_transforms.rescale
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@ -18,7 +18,7 @@ from typing import Iterable, List, Optional, Tuple, Union
 import numpy as np
-from transformers.utils import TensorType
+from transformers.utils import ExplicitEnum, TensorType
 from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available
@ -569,3 +569,100 @@ def id_to_rgb(id_map):
        color.append(id_map % 256)
        id_map //= 256
    return color
 class PaddingMode(ExplicitEnum):
    """
    Enum class for the different padding modes to use when padding images.
    """
    CONSTANT = "constant"
    REFLECT = "reflect"
    REPLICATE = "replicate"
    SYMMETRIC = "symmetric"
 def pad(
    image: np.ndarray,
    padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
    mode: PaddingMode = PaddingMode.CONSTANT,
    constant_values: Union[float, Iterable[float]] = 0.0,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
 ) -> np.ndarray:
    """
    Pads the `image` with the specified (height, width) `padding` and `mode`.
    Args:
        image (`np.ndarray`):
            The image to pad.
        padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
            Padding to apply to the edges of the height, width axes. Can be one of three formats:
            - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
            - `((before, after),)` yields same before and after pad for height and width.
            - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
        mode (`PaddingMode`):
            The padding mode to use. Can be one of:
                - `"constant"`: pads with a constant value.
                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                  vector along each axis.
                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
        constant_values (`float` or `Iterable[float]`, *optional*):
            The value to use for the padding if `mode` is `"constant"`.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use same as the input image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.
    Returns:
        `np.ndarray`: The padded image.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    def _expand_for_data_format(values):
        """
        Convert values to be in the format expected by np.pad based on the data format.
        """
        if isinstance(values, (int, float)):
            values = ((values, values), (values, values))
        elif isinstance(values, tuple) and len(values) == 1:
            values = ((values[0], values[0]), (values[0], values[0]))
        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int):
            values = (values, values)
        elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple):
            values = values
        else:
            raise ValueError(f"Unsupported format: {values}")
        # add 0 for channel dimension
        values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0))
        # Add additional padding if there's a batch dimension
        values = (0, *values) if image.ndim == 4 else values
        return values
    padding = _expand_for_data_format(padding)
    if mode == PaddingMode.CONSTANT:
        constant_values = _expand_for_data_format(constant_values)
        image = np.pad(image, padding, mode="constant", constant_values=constant_values)
    elif mode == PaddingMode.REFLECT:
        image = np.pad(image, padding, mode="reflect")
    elif mode == PaddingMode.REPLICATE:
        image = np.pad(image, padding, mode="edge")
    elif mode == PaddingMode.SYMMETRIC:
        image = np.pad(image, padding, mode="symmetric")
    else:
        raise ValueError(f"Invalid padding mode: {mode}")
    image = to_channel_dimension_format(image, data_format) if data_format is not None else image
    return image
--- a/tests/test_image_transforms.py
+++ b/tests/test_image_transforms.py
@ -41,6 +41,7 @@ if is_vision_available():
        get_resize_output_image_size,
        id_to_rgb,
        normalize,
        pad,
        resize,
        rgb_to_id,
        to_channel_dimension_format,
@ -289,3 +290,127 @@ class ImageTransformsTester(unittest.TestCase):
            ]
        )
        self.assertTrue(np.allclose(id_to_rgb(id_array), color))
    def test_pad(self):
        # fmt: off
        image = np.array([[
            [0, 1],
            [2, 3],
        ]])
        # fmt: on
        # Test that exception is raised if unknown padding mode is specified
        with self.assertRaises(ValueError):
            pad(image, 10, mode="unknown")
        # Test that exception is raised if invalid padding is specified
        with self.assertRaises(ValueError):
            # Cannot pad on channel dimension
            pad(image, (5, 10, 10))
        # Test image is padded equally on all sides is padding is an int
        # fmt: off
        expected_image = np.array([
            [[0, 0, 0, 0],
             [0, 0, 1, 0],
             [0, 2, 3, 0],
             [0, 0, 0, 0]],
        ])
        # fmt: on
        self.assertTrue(np.allclose(expected_image, pad(image, 1)))
        # Test the left and right of each axis is padded (pad_left, pad_right)
        # fmt: off
        expected_image = np.array(
            [[0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0],
             [0, 0, 0, 1, 0],
             [0, 0, 2, 3, 0],
             [0, 0, 0, 0, 0]])
        # fmt: on
        self.assertTrue(np.allclose(expected_image, pad(image, (2, 1))))
        # Test only one axis is padded (pad_left, pad_right)
        # fmt: off
        expected_image = np.array([[
            [9, 9],
            [9, 9],
            [0, 1],
            [2, 3],
            [9, 9]
        ]])
        # fmt: on
        self.assertTrue(np.allclose(expected_image, pad(image, ((2, 1), (0, 0)), constant_values=9)))
        # Test padding with a constant value
        # fmt: off
        expected_image = np.array([[
            [8, 8, 0, 1, 9],
            [8, 8, 2, 3, 9],
            [8, 8, 7, 7, 9],
            [8, 8, 7, 7, 9]
        ]])
        # fmt: on
        self.assertTrue(np.allclose(expected_image, pad(image, ((0, 2), (2, 1)), constant_values=((6, 7), (8, 9)))))
        # fmt: off
        image = np.array([[
            [0, 1, 2],
            [3, 4, 5],
            [6, 7, 8],
        ]])
        # fmt: on
        # Test padding with PaddingMode.REFLECT
        # fmt: off
        expected_image = np.array([[
            [2, 1, 0, 1, 2, 1],
            [5, 4, 3, 4, 5, 4],
            [8, 7, 6, 7, 8, 7],
            [5, 4, 3, 4, 5, 4],
            [2, 1, 0, 1, 2, 1],
        ]])
        # fmt: on
        self.assertTrue(np.allclose(expected_image, pad(image, ((0, 2), (2, 1)), mode="reflect")))
        # Test padding with PaddingMode.REPLICATE
        # fmt: off
        expected_image = np.array([[
            [0, 0, 0, 1, 2, 2],
            [3, 3, 3, 4, 5, 5],
            [6, 6, 6, 7, 8, 8],
            [6, 6, 6, 7, 8, 8],
            [6, 6, 6, 7, 8, 8],
        ]])
        # fmt: on
        self.assertTrue(np.allclose(expected_image, pad(image, ((0, 2), (2, 1)), mode="replicate")))
        # Test padding with PaddingMode.SYMMETRIC
        # fmt: off
        expected_image = np.array([[
            [1, 0, 0, 1, 2, 2],
            [4, 3, 3, 4, 5, 5],
            [7, 6, 6, 7, 8, 8],
            [7, 6, 6, 7, 8, 8],
            [4, 3, 3, 4, 5, 5],
        ]])
        # fmt: on
        self.assertTrue(np.allclose(expected_image, pad(image, ((0, 2), (2, 1)), mode="symmetric")))
        # Test we can specify the output data format
        # Test padding with PaddingMode.REFLECT
        # fmt: off
        image = np.array([[
            [0, 1],
            [2, 3],
        ]])
        expected_image = np.array([
            [[0], [1], [0], [1], [0]],
            [[2], [3], [2], [3], [2]],
            [[0], [1], [0], [1], [0]],
            [[2], [3], [2], [3], [2]]
        ])
        # fmt: on
        self.assertTrue(
            np.allclose(expected_image, pad(image, ((0, 2), (2, 1)), mode="reflect", data_format="channels_last"))
        )