mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-02 19:21:31 +06:00
Load and save video-processor from separate folder (#33562)
* load and save from video-processor folder * Update src/transformers/models/llava_onevision/processing_llava_onevision.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
parent
5af7d41e49
commit
e40bb4845e
@ -621,6 +621,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
||||
size = size if size is not None else self.size
|
||||
size = get_size_dict(size, default_to_square=False)
|
||||
image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
|
||||
resample = resample if resample is not None else self.resample
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
|
@ -17,6 +17,7 @@ Processor class for LLaVa-Onevision.
|
||||
"""
|
||||
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
from typing import Iterable, List, Union
|
||||
|
||||
@ -34,6 +35,11 @@ from ...processing_utils import (
|
||||
ProcessorMixin,
|
||||
)
|
||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
from ...utils import logging
|
||||
from ..auto import AutoImageProcessor
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False):
|
||||
@ -96,7 +102,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
||||
chat_template=None,
|
||||
image_token="<image>",
|
||||
video_token="<video>",
|
||||
**kwargs: Unpack[LlavaOnevisionProcessorKwargs],
|
||||
**kwargs,
|
||||
):
|
||||
self.num_image_tokens = num_image_tokens
|
||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||
@ -109,7 +115,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
||||
images: ImageInput = None,
|
||||
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
|
||||
videos: VideoInput = None,
|
||||
**kwargs,
|
||||
**kwargs: Unpack[LlavaOnevisionProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
|
||||
@ -272,3 +278,46 @@ class LlavaOnevisionProcessor(ProcessorMixin):
|
||||
tokenizer_input_names = self.tokenizer.model_input_names
|
||||
image_processor_input_names = self.image_processor.model_input_names
|
||||
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
|
||||
|
||||
# override to save video-config in a separate config file
|
||||
def save_pretrained(self, save_directory, **kwargs):
    """
    Save the processor, writing the video processor into its own `video_processor` subfolder.

    The video processor is saved to `<save_directory>/video_processor`. The `"video_processor"`
    entry is then temporarily taken out of `self.attributes` while the parent `save_pretrained`
    runs, and is always put back afterwards, even if the parent call raises.

    Args:
        save_directory: Target directory. It is created if it does not exist.
        **kwargs: Forwarded unchanged to the parent `save_pretrained`.

    Returns:
        Whatever the parent `save_pretrained` returns.

    Raises:
        ValueError: If `save_directory` points to an existing file.
    """
    if os.path.isfile(save_directory):
        raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
    os.makedirs(save_directory, exist_ok=True)

    video_processor_path = os.path.join(save_directory, "video_processor")
    self.video_processor.save_pretrained(video_processor_path)

    # `attributes` is mutated in place, so remember where the entry sat and restore it to the
    # same position instead of re-appending it at the end.
    attributes = self.attributes
    video_processor_index = attributes.index("video_processor") if "video_processor" in attributes else None
    if video_processor_index is not None:
        attributes.pop(video_processor_index)

    try:
        outputs = super().save_pretrained(save_directory, **kwargs)
    finally:
        # Restore unconditionally: a failed save must not leave the list without its
        # "video_processor" entry for subsequent calls.
        if video_processor_index is not None:
            attributes.insert(video_processor_index, "video_processor")

    return outputs
|
||||
|
||||
# override to load video-config from a separate config file
|
||||
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    """
    Load the processor, then load the video processor from the `video_processor` subfolder.

    The parent `from_pretrained` builds the processor first. The video processor is then loaded
    with `AutoImageProcessor.from_pretrained(..., subfolder="video_processor")` and attached as
    `processor.video_processor`. If that load raises `EnvironmentError`, the processor returned
    by the parent is kept as-is and an informational message is logged.

    Args:
        pretrained_model_name_or_path: Identifier or path passed to both loaders.
        **kwargs: Forwarded unchanged to the parent `from_pretrained`. The hub-resolution
            options among them (`cache_dir`, `force_download`, `local_files_only`, `proxies`,
            `revision`, `token`, `trust_remote_code`) are also forwarded to the video-processor
            load when the caller supplied them, so both loads resolve against the same source.

    Returns:
        The processor instance. When the parent returns a tuple (the `return_unused_kwargs`
        case), only its first element is returned.
    """
    processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)

    # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
    if isinstance(processor, tuple):
        processor = processor[0]

    # Only options that control where and how files are resolved are forwarded; anything else in
    # `kwargs` is processor-specific and is not meant for the image-processor loader.
    hub_kwarg_names = (
        "cache_dir",
        "force_download",
        "local_files_only",
        "proxies",
        "revision",
        "token",
        "trust_remote_code",
    )
    hub_kwargs = {name: kwargs[name] for name in hub_kwarg_names if name in kwargs}

    try:
        video_processor = AutoImageProcessor.from_pretrained(
            pretrained_model_name_or_path, subfolder="video_processor", **hub_kwargs
        )
    except EnvironmentError:
        # this means users are using prev version of saved processor where we had only one preprocessor_config.json
        # for loading back that should work and load a LlavaOnevisionVideoProcessor class
        logger.info(
            "You are loading `LlavaOnevisionProcessor` but the indicated `path` doesn't contain a folder called "
            "`video_processor`. It is strongly recommended to load and save the processor again so the video processor is saved "
            "in a separate config."
        )
    else:
        processor.video_processor = video_processor

    return processor
|
||||
|
@ -58,15 +58,16 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
|
||||
|
||||
def prepare_processor_dict(self):
|
||||
return {"chat_template": "dummy_template"}
|
||||
return {"chat_template": "dummy_template", "num_image_tokens": 6, "vision_feature_select_strategy": "default"}
|
||||
|
||||
@unittest.skip(
|
||||
"Skip because the model has no processor kwargs except for chat template and"
|
||||
"chat template is saved as a separate file. Stop skipping this test when the processor"
|
||||
"has new kwargs saved in config file."
|
||||
)
|
||||
def test_processor_to_json_string(self):
|
||||
pass
|
||||
processor = self.get_processor()
|
||||
obj = json.loads(processor.to_json_string())
|
||||
for key, value in self.prepare_processor_dict().items():
|
||||
# chat_template is tested in a separate test because it is saved in a separate file
|
||||
if key != "chat_template":
|
||||
self.assertEqual(obj[key], value)
|
||||
self.assertEqual(getattr(processor, key, None), value)
|
||||
|
||||
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
||||
def test_chat_template_is_saved(self):
|
||||
@ -191,7 +192,7 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
max_length=76,
|
||||
)
|
||||
self.assertEqual(inputs["pixel_values"].shape[3], 214)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 5)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 4)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
@ -282,7 +283,7 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 112)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 2)
|
||||
|
||||
@require_vision
|
||||
@require_torch
|
||||
@ -299,4 +300,4 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 117)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 2)
|
||||
|
@ -112,6 +112,14 @@ class ProcessorTesterMixin:
|
||||
|
||||
self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
|
||||
|
||||
for attribute in processor_first.attributes:
|
||||
attribute_first = getattr(processor_first, attribute)
|
||||
attribute_second = getattr(processor_second, attribute)
|
||||
|
||||
# tokenizer repr contains model-path from where we loaded
|
||||
if "tokenizer" not in attribute:
|
||||
self.assertEqual(repr(attribute_first), repr(attribute_second))
|
||||
|
||||
# These kwargs-related tests ensure that processors are correctly instantiated.
|
||||
# they need to be applied only if an image_processor exists.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user