diff --git a/setup.py b/setup.py index 70aaa187620..2d3f45cf14c 100644 --- a/setup.py +++ b/setup.py @@ -466,7 +466,12 @@ setup( package_data={"": ["**/*.cu", "**/*.cpp", "**/*.cuh", "**/*.h", "**/*.pyx", "py.typed"]}, zip_safe=False, extras_require=extras, - entry_points={"console_scripts": ["transformers=transformers.commands.transformers_cli:main", "transformers-cli=transformers.commands.transformers_cli:main_cli"]}, + entry_points={ + "console_scripts": [ + "transformers=transformers.commands.transformers_cli:main", + "transformers-cli=transformers.commands.transformers_cli:main_cli", + ] + }, python_requires=">=3.9.0", install_requires=list(install_requires), classifiers=[ diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py index d1a722e9054..44c6d40a4c6 100644 --- a/src/transformers/models/aria/image_processing_aria.py +++ b/src/transformers/models/aria/image_processing_aria.py @@ -18,12 +18,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import math from typing import Iterable, List, Optional, Tuple, Union import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format from ...image_utils import ( ChannelDimension, @@ -71,23 +70,6 @@ def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> Li return patches -def _get_patch_output_size(image, target_resolution, input_data_format): - original_height, original_width = get_image_size(image, channel_dim=input_data_format) - target_height, target_width = target_resolution - - scale_w = target_width / original_width - scale_h = target_height / original_height - - if scale_w < scale_h: - new_width = target_width - new_height = min(math.ceil(original_height * scale_w), target_height) - else: - new_height = target_height - new_width = min(math.ceil(original_width * scale_h), target_width) - - return new_height, new_width - - class AriaImageProcessor(BaseImageProcessor): """ A vision processor for the Aria model that handles image preprocessing. @@ -375,7 +357,7 @@ class AriaImageProcessor(BaseImageProcessor): Returns: np.array: The resized and padded image. """ - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) # Resize the image resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) @@ -389,12 +371,12 @@ class AriaImageProcessor(BaseImageProcessor): Pad an image to a target resolution while maintaining aspect ratio. """ target_height, target_width = target_resolution - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) - paste_x = (target_width - new_width) // 2 - paste_y = (target_height - new_height) // 2 + paste_x, r_x = divmod(target_width - new_width, 2) + paste_y, r_y = divmod(target_height - new_height, 2) - padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x))) return padded_image diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 51e203b07b2..e4d063f7827 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import math from typing import Dict, Iterable, List, Optional, Tuple, Union import numpy as np @@ -20,7 +19,7 @@ import numpy as np from ...activations import ACT2FN from ...configuration_utils import PretrainedConfig from ...generation import GenerationMixin -from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_patch_output_size, select_best_resolution from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format from ...image_utils import ( ChannelDimension, @@ -461,23 +460,6 @@ class AriaProjector(nn.Module): return out -def _get_patch_output_size(image, target_resolution, input_data_format): - original_height, original_width = get_image_size(image, channel_dim=input_data_format) - target_height, target_width = target_resolution - - scale_w = target_width / original_width - scale_h = target_height / original_height - - if scale_w < scale_h: - new_width = target_width - new_height = min(math.ceil(original_height * scale_w), target_height) - else: - new_height = target_height - new_width = min(math.ceil(original_width * scale_h), target_width) - - return new_height, new_width - - class AriaImageProcessor(BaseImageProcessor): """ A vision processor for the Aria model that handles image preprocessing. @@ -765,7 +747,7 @@ class AriaImageProcessor(BaseImageProcessor): Returns: np.array: The resized and padded image. """ - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) # Resize the image resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) @@ -779,12 +761,12 @@ class AriaImageProcessor(BaseImageProcessor): Pad an image to a target resolution while maintaining aspect ratio. """ target_height, target_width = target_resolution - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) - paste_x = (target_width - new_width) // 2 - paste_y = (target_height - new_height) // 2 + paste_x, r_x = divmod(target_width - new_width, 2) + paste_y, r_y = divmod(target_height - new_height, 2) - padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x))) return padded_image diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index e1afee31928..bf8920e955a 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -14,12 +14,17 @@ # limitations under the License. """Image processor class for LLaVa-NeXT.""" -import math from typing import Dict, Iterable, List, Optional, Tuple, Union import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution +from ...image_processing_utils import ( + BaseImageProcessor, + BatchFeature, + get_patch_output_size, + get_size_dict, + select_best_resolution, +) from ...image_transforms import ( PaddingMode, convert_to_rgb, @@ -99,23 +104,6 @@ def expand_to_square(image: np.array, background_color, input_data_format) -> np return result -def _get_patch_output_size(image, target_resolution, input_data_format): - original_height, original_width = get_image_size(image, channel_dim=input_data_format) - target_height, target_width = target_resolution - - scale_w = target_width / original_width - scale_h = target_height / original_height - - if scale_w < scale_h: - new_width = target_width - new_height = min(math.ceil(original_height * scale_w), target_height) - else: - new_height = target_height - new_width = min(math.ceil(original_width * scale_h), target_width) - - return new_height, new_width - - class LlavaNextImageProcessor(BaseImageProcessor): r""" Constructs a LLaVa-NeXT image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques @@ -429,7 +417,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): Returns: np.array: The resized and padded image. """ - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) # Resize the image resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) @@ -443,12 +431,12 @@ class LlavaNextImageProcessor(BaseImageProcessor): Pad an image to a target resolution while maintaining aspect ratio. """ target_height, target_width = target_resolution - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) - paste_x = (target_width - new_width) // 2 - paste_y = (target_height - new_height) // 2 + paste_x, r_x = divmod(target_width - new_width, 2) + paste_y, r_y = divmod(target_height - new_height, 2) - padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x))) return padded_image diff --git a/src/transformers/models/llava_next/image_processing_llava_next_fast.py b/src/transformers/models/llava_next/image_processing_llava_next_fast.py index d4caf2a19a2..1118db06e07 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next_fast.py +++ b/src/transformers/models/llava_next/image_processing_llava_next_fast.py @@ -102,8 +102,8 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. do_pad (`bool`, *optional*): - Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest - number of patches in the batch. Padding will be applied to the bottom and right with zeros. + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. """, ) def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature: @@ -164,10 +164,10 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast): target_height, target_width = target_resolution new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) - paste_x = (target_width - new_width) // 2 - paste_y = (target_height - new_height) // 2 + paste_x, r_x = divmod(target_width - new_width, 2) + paste_y, r_y = divmod(target_height - new_height, 2) - padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x, paste_y]) + padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x + r_x, paste_y + r_y]) return padded_image diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 90cf7ea252b..fa7c04bdf40 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -139,14 +139,14 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(round(original_height * scale_factor, 7)) - padding = (current_height - new_height) // 2 - unpadded_tensor = tensor[:, padding : current_height - padding, :] + new_height = min(math.ceil(original_height * scale_factor), current_height) + padding, r = divmod(current_height - new_height, 2) + unpadded_tensor = tensor[:, padding : current_height - (padding + r), :] else: scale_factor = current_height / original_height - new_width = int(round(original_width * scale_factor, 7)) - padding = (current_width - new_width) // 2 - unpadded_tensor = tensor[:, :, padding : current_width - padding] + new_width = min(math.ceil(original_width * scale_factor), current_width) + padding, r = divmod(current_width - new_width, 2) + unpadded_tensor = tensor[:, :, padding : current_width - (padding + r)] return unpadded_tensor diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index c89e2d72ef8..113441fd2aa 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -262,14 +262,14 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(round(original_height * scale_factor, 7)) - padding = (current_height - new_height) // 2 - unpadded_tensor = tensor[:, padding : current_height - padding, :] + new_height = min(math.ceil(original_height * scale_factor), current_height) + padding, r = divmod(current_height - new_height, 2) + unpadded_tensor = tensor[:, padding : current_height - (padding + r), :] else: scale_factor = current_height / original_height - new_width = int(round(original_width * scale_factor, 7)) - padding = (current_width - new_width) // 2 - unpadded_tensor = tensor[:, :, padding : current_width - padding] + new_width = min(math.ceil(original_width * scale_factor), current_width) + padding, r = divmod(current_width - new_width, 2) + unpadded_tensor = tensor[:, :, padding : current_width - (padding + r)] return unpadded_tensor diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index 8cfdfee1f4a..502e9ac74c4 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -14,12 +14,17 @@ # limitations under the License. """Image processor class for LLaVa-Onevision.""" -import math from typing import Dict, Iterable, List, Optional, Tuple, Union import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution +from ...image_processing_utils import ( + BaseImageProcessor, + BatchFeature, + get_patch_output_size, + get_size_dict, + select_best_resolution, +) from ...image_transforms import ( PaddingMode, convert_to_rgb, @@ -99,24 +104,6 @@ def expand_to_square(image: np.array, background_color, input_data_format) -> np return result -# Copied from transformers.models.llava_next.image_processing_llava_next._get_patch_output_size -def _get_patch_output_size(image, target_resolution, input_data_format): - original_height, original_width = get_image_size(image, channel_dim=input_data_format) - target_height, target_width = target_resolution - - scale_w = target_width / original_width - scale_h = target_height / original_height - - if scale_w < scale_h: - new_width = target_width - new_height = min(math.ceil(original_height * scale_w), target_height) - else: - new_height = target_height - new_width = min(math.ceil(original_width * scale_h), target_width) - - return new_height, new_width - - class LlavaOnevisionImageProcessor(BaseImageProcessor): r""" Constructs a LLaVa-Onevision image processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame. @@ -151,8 +138,8 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor): number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. Can be overridden by the `image_std` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): - Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest - number of patches in the batch. Padding will be applied to the bottom and right with zeros. + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. do_convert_rgb (`bool`, *optional*, defaults to `True`): Whether to convert the image to RGB. """ @@ -321,7 +308,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor): Returns: np.array: The resized and padded image. """ - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) # Resize the image resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) @@ -336,12 +323,12 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor): Pad an image to a target resolution while maintaining aspect ratio. """ target_height, target_width = target_resolution - new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) - paste_x = (target_width - new_width) // 2 - paste_y = (target_height - new_height) // 2 + paste_x, r_x = divmod(target_width - new_width, 2) + paste_y, r_y = divmod(target_height - new_height, 2) - padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + padded_image = self.pad(image, padding=((paste_y, paste_y + r_y), (paste_x, paste_x + r_x))) return padded_image diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index 598ac78f538..f5e2da2cd9e 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -84,8 +84,8 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list of the form `(height, width)`. do_pad (`bool`, *optional*): - Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest - number of patches in the batch. Padding will be applied to the bottom and right with zeros. + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. """, ) def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature: @@ -146,10 +146,10 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): target_height, target_width = target_resolution new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format) - paste_x = (target_width - new_width) // 2 - paste_y = (target_height - new_height) // 2 + paste_x, r_x = divmod(target_width - new_width, 2) + paste_y, r_y = divmod(target_height - new_height, 2) - padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x, paste_y]) + padded_image = F.pad(image, padding=[paste_x, paste_y, paste_x + r_x, paste_y + r_y]) return padded_image diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index dfd43643958..be67df9b3af 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -140,14 +140,14 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(round(original_height * scale_factor, 7)) - padding = (current_height - new_height) // 2 - unpadded_tensor = tensor[:, padding : current_height - padding, :] + new_height = min(math.ceil(original_height * scale_factor), current_height) + padding, r = divmod(current_height - new_height, 2) + unpadded_tensor = tensor[:, padding : current_height - (padding + r), :] else: scale_factor = current_height / original_height - new_width = int(round(original_width * scale_factor, 7)) - padding = (current_width - new_width) // 2 - unpadded_tensor = tensor[:, :, padding : current_width - padding] + new_width = min(math.ceil(original_width * scale_factor), current_width) + padding, r = divmod(current_width - new_width, 2) + unpadded_tensor = tensor[:, :, padding : current_width - (padding + r)] return unpadded_tensor diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 753abc924ac..a5acd5b320e 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -70,6 +70,8 @@ class LlavaOnevisionProcessor(ProcessorMixin): Special token used to denote image location. video_token (`str`, *optional*, defaults to `"