# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import shutil
import tempfile
import unittest

from transformers import GemmaTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, require_vision
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin


if is_vision_available():
    from transformers import (
        PaliGemmaProcessor,
        SiglipImageProcessor,
        is_vision_available,
    )

# SentencePiece model from the test fixtures directory, used to build the tokenizer.
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")


@require_vision
class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Tests for ``PaliGemmaProcessor`` built from a SigLIP image processor and a Gemma tokenizer."""

    # The class body runs before ``@require_vision`` is applied, and the
    # processor import above is guarded, so an unguarded reference would raise
    # ``NameError`` at import time when vision support is missing.
    processor_class = PaliGemmaProcessor if is_vision_available() else None

    def setUp(self):
        """Create a temporary directory and save a freshly built processor into it."""
        self.tmpdirname = tempfile.mkdtemp()
        # ``tearDown`` is not invoked when ``setUp`` raises, so register the
        # removal right away; ``ignore_errors`` makes it a no-op when
        # ``tearDown`` has already deleted the directory.
        self.addCleanup(shutil.rmtree, self.tmpdirname, ignore_errors=True)

        image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
        image_processor.image_seq_length = 0
        tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)

        processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
        processor.save_pretrained(self.tmpdirname)

    def tearDown(self):
        """Remove the temporary directory created in ``setUp``."""
        shutil.rmtree(self.tmpdirname, ignore_errors=True)

    @require_torch
    @require_vision
    def test_image_seq_length(self):
        """Check that ``input_ids`` has length ``max_length`` plus ``image_seq_length``."""
        input_str = "lower newer"
        image_input = self.prepare_image_inputs()

        # NOTE(review): ``get_component`` comes from ProcessorTesterMixin; presumably
        # it loads the component from ``self.tmpdirname`` - confirm against the mixin.
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer", max_length=112, padding="max_length")
        image_processor.image_seq_length = 14

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
        inputs = processor(
            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
        )

        # 112 padded text positions plus the 14 image positions configured above.
        self.assertEqual(len(inputs["input_ids"][0]), 112 + 14)