mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-02 03:01:07 +06:00
Fix processor kwargs qwen2 vl (#36890)
* Fix qwen2_vl and qwen2_5_vl processors custom images kwargs * change version warning
This commit is contained in:
parent
48385aa4f4
commit
91455c1825
@ -493,7 +493,7 @@ class AutoImageProcessor:
|
|||||||
image_processor_auto_map = config.auto_map["AutoImageProcessor"]
|
image_processor_auto_map = config.auto_map["AutoImageProcessor"]
|
||||||
|
|
||||||
image_processor_class = None
|
image_processor_class = None
|
||||||
# TODO: @yoni, change logic in v4.50 (when use_fast set to True by default)
|
# TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
|
||||||
if image_processor_type is not None:
|
if image_processor_type is not None:
|
||||||
# if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
|
# if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor.
|
||||||
if use_fast is None:
|
if use_fast is None:
|
||||||
@ -501,7 +501,7 @@ class AutoImageProcessor:
|
|||||||
if not use_fast:
|
if not use_fast:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
|
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
|
||||||
"`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. "
|
"`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
|
||||||
"This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
|
"This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
|
||||||
)
|
)
|
||||||
# Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version.
|
# Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version.
|
||||||
|
@ -41,7 +41,7 @@ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
|
|||||||
VisionRotaryEmbedding,
|
VisionRotaryEmbedding,
|
||||||
VisionSdpaAttention,
|
VisionSdpaAttention,
|
||||||
)
|
)
|
||||||
from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
|
from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor
|
||||||
|
|
||||||
from ...activations import ACT2FN
|
from ...activations import ACT2FN
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
@ -816,7 +816,12 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
|
|||||||
fps: Union[List[float], float]
|
fps: Union[List[float], float]
|
||||||
|
|
||||||
|
|
||||||
|
class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
|
class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
images_kwargs: Qwen2_5_VLImagesKwargs
|
||||||
videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
|
videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
|
||||||
_defaults = {
|
_defaults = {
|
||||||
"text_kwargs": {
|
"text_kwargs": {
|
||||||
|
@ -23,11 +23,11 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
from typing import List, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature
|
from ...feature_extraction_utils import BatchFeature
|
||||||
from ...image_utils import ImageInput, VideoInput
|
from ...image_utils import ImageInput, VideoInput
|
||||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
|
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
|
||||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||||
|
|
||||||
|
|
||||||
@ -35,7 +35,16 @@ class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
|
|||||||
fps: Union[List[float], float]
|
fps: Union[List[float], float]
|
||||||
|
|
||||||
|
|
||||||
|
class Qwen2_5_VLImagesKwargs(ImagesKwargs):
|
||||||
|
min_pixels: Optional[int]
|
||||||
|
max_pixels: Optional[int]
|
||||||
|
patch_size: Optional[int]
|
||||||
|
temporal_patch_size: Optional[int]
|
||||||
|
merge_size: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
|
class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
images_kwargs: Qwen2_5_VLImagesKwargs
|
||||||
videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
|
videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
|
||||||
_defaults = {
|
_defaults = {
|
||||||
"text_kwargs": {
|
"text_kwargs": {
|
||||||
|
@ -384,7 +384,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
|
|||||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||||
min_pixels = size["shortest_edge"]
|
min_pixels = size["shortest_edge"]
|
||||||
else:
|
else:
|
||||||
size = self.size
|
size = {**self.size}
|
||||||
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
||||||
if min_pixels is not None:
|
if min_pixels is not None:
|
||||||
size["shortest_edge"] = min_pixels
|
size["shortest_edge"] = min_pixels
|
||||||
|
@ -339,7 +339,7 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
|||||||
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
||||||
min_pixels = size["shortest_edge"]
|
min_pixels = size["shortest_edge"]
|
||||||
else:
|
else:
|
||||||
size = self.size
|
size = {**self.size}
|
||||||
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
||||||
if min_pixels is not None:
|
if min_pixels is not None:
|
||||||
size["shortest_edge"] = min_pixels
|
size["shortest_edge"] = min_pixels
|
||||||
|
@ -21,11 +21,11 @@
|
|||||||
Processor class for Qwen2-VL.
|
Processor class for Qwen2-VL.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature
|
from ...feature_extraction_utils import BatchFeature
|
||||||
from ...image_utils import ImageInput, VideoInput
|
from ...image_utils import ImageInput, VideoInput
|
||||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
|
||||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
@ -33,7 +33,16 @@ from ...utils import logging
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Qwen2VLImagesKwargs(ImagesKwargs):
|
||||||
|
min_pixels: Optional[int]
|
||||||
|
max_pixels: Optional[int]
|
||||||
|
patch_size: Optional[int]
|
||||||
|
temporal_patch_size: Optional[int]
|
||||||
|
merge_size: Optional[int]
|
||||||
|
|
||||||
|
|
||||||
class Qwen2VLProcessorKwargs(ProcessingKwargs, total=False):
|
class Qwen2VLProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
images_kwargs: Qwen2VLImagesKwargs
|
||||||
_defaults = {
|
_defaults = {
|
||||||
"text_kwargs": {
|
"text_kwargs": {
|
||||||
"padding": False,
|
"padding": False,
|
||||||
|
@ -1111,12 +1111,12 @@ class ProcessorMixin(PushToHubMixin):
|
|||||||
if isinstance(class_name, tuple):
|
if isinstance(class_name, tuple):
|
||||||
classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name)
|
classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name)
|
||||||
if attribute_name == "image_processor":
|
if attribute_name == "image_processor":
|
||||||
# TODO: @yoni, change logic in v4.50 (when use_fast set to True by default)
|
# TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)
|
||||||
use_fast = kwargs.get("use_fast", None)
|
use_fast = kwargs.get("use_fast", None)
|
||||||
if use_fast is None:
|
if use_fast is None:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
|
"Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. "
|
||||||
"`use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. "
|
"`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. "
|
||||||
"This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
|
"This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
@ -310,3 +310,19 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
|
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
|
||||||
|
|
||||||
|
def test_kwargs_overrides_custom_image_processor_kwargs(self):
|
||||||
|
processor_components = self.prepare_components()
|
||||||
|
processor_components["image_processor"] = self.get_component("image_processor")
|
||||||
|
processor_components["tokenizer"] = self.get_component("tokenizer")
|
||||||
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
|
||||||
|
processor = self.processor_class(**processor_components, **processor_kwargs, use_fast=True)
|
||||||
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs()
|
||||||
|
image_input = self.prepare_image_inputs()
|
||||||
|
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
|
||||||
|
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
||||||
|
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||||
|
self.assertEqual(inputs[self.images_input_name].shape[0], 800)
|
||||||
|
@ -307,3 +307,19 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
||||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
|
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 71280)
|
||||||
|
|
||||||
|
def test_kwargs_overrides_custom_image_processor_kwargs(self):
|
||||||
|
processor_components = self.prepare_components()
|
||||||
|
processor_components["image_processor"] = self.get_component("image_processor")
|
||||||
|
processor_components["tokenizer"] = self.get_component("tokenizer")
|
||||||
|
processor_kwargs = self.prepare_processor_dict()
|
||||||
|
|
||||||
|
processor = self.processor_class(**processor_components, **processor_kwargs, use_fast=True)
|
||||||
|
self.skip_processor_without_typed_kwargs(processor)
|
||||||
|
|
||||||
|
input_str = self.prepare_text_inputs()
|
||||||
|
image_input = self.prepare_image_inputs()
|
||||||
|
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||||
|
self.assertEqual(inputs[self.images_input_name].shape[0], 800)
|
||||||
|
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
|
||||||
|
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
|
||||||
|
Loading…
Reference in New Issue
Block a user