# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import shutil import tempfile import unittest from transformers import GemmaTokenizer from transformers.testing_utils import get_tests_dir, require_torch, require_vision from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin if is_vision_available(): from transformers import ( PaliGemmaProcessor, SiglipImageProcessor, is_vision_available, ) SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_vision class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = PaliGemmaProcessor def setUp(self): self.tmpdirname = tempfile.mkdtemp() image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384") image_processor.image_seq_length = 0 tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer) processor.save_pretrained(self.tmpdirname) def tearDown(self): shutil.rmtree(self.tmpdirname) @require_torch @require_vision def test_image_seq_length(self): input_str = "lower newer" image_input = self.prepare_image_inputs() image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer", max_length=112, padding="max_length") image_processor.image_seq_length = 14 processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) self.assertEqual(len(inputs["input_ids"][0]), 112 + 14) @require_torch @require_vision def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] image_input = self.prepare_image_inputs() * 2 inputs = processor( text=input_str, images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, padding="longest", max_length=76, ) self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 10)