transformers/tests/models/layoutlmv2/test_processor_layoutlmv2.py
NielsRogge 31ee80d556
Add LayoutLMv3 (#17060)
* Make forward pass work

* More improvements

* Remove unused imports

* Remove timm dependency

* Improve loss calculation of token classifier

* Fix most tests

* Add docs

* Add model integration test

* Make all tests pass

* Add LayoutLMv3FeatureExtractor

* Improve integration test + make fixup

* Add example script

* Fix style

* Add LayoutLMv3Processor

* Fix style

* Add option to add visual labels

* Make more tokenizer tests pass

* Fix more tests

* Make more tests pass

* Fix bug and improve docs

* Fix import of processors

* Improve docstrings

* Fix toctree and improve docs

* Fix auto tokenizer

* Move tests to model folder

* Move tests to model folder

* change default behavior add_prefix_space

* add prefix space for fast

* add_prefix_spcae set to True for Fast

* no space before `unique_no_split` token

* add test to hightligh special treatment of added tokens

* fix `test_batch_encode_dynamic_overflowing` by building a long enough example

* fix `test_full_tokenizer` with add_prefix_token

* Fix tokenizer integration test

* Make the code more readable

* Add tests for LayoutLMv3Processor

* Fix style

* Add model to README and update init

* Apply suggestions from code review

* Replace asserts by value errors

* Add suggestion by @ducviet00

* Add model to doc tests

* Simplify script

* Improve README

* a step ahead to fix

* Update pair_input_test

* Make all tokenizer tests pass - phew

* Make style

* Add LayoutLMv3 to CI job

* Fix auto mapping

* Fix CI job name

* Make all processor tests pass

* Make tests of LayoutLMv2 and LayoutXLM consistent

* Add copied from statements to fast tokenizer

* Add copied from statements to slow tokenizer

* Remove add_visual_labels attribute

* Fix tests

* Add link to notebooks

* Improve docs of LayoutLMv3Processor

* Fix reference to section

Co-authored-by: SaulLu <lucilesaul.com@gmail.com>
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-05-24 09:53:45 +02:00

467 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import shutil
import tempfile
import unittest
from typing import List
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.models.layoutlmv2 import LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast
from transformers.models.layoutlmv2.tokenization_layoutlmv2 import VOCAB_FILES_NAMES
from transformers.testing_utils import require_pytesseract, require_tokenizers, require_torch, slow
from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
if is_pytesseract_available():
from PIL import Image
from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
@require_pytesseract
@require_tokenizers
class LayoutLMv2ProcessorTest(unittest.TestCase):
tokenizer_class = LayoutLMv2Tokenizer
rust_tokenizer_class = LayoutLMv2TokenizerFast
def setUp(self):
vocab_tokens = [
"[UNK]",
"[CLS]",
"[SEP]",
"[PAD]",
"[MASK]",
"want",
"##want",
"##ed",
"wa",
"un",
"runn",
"##ing",
",",
"low",
"lowest",
]
feature_extractor_map = {
"do_resize": True,
"size": 224,
"apply_ocr": True,
}
self.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(feature_extractor_map) + "\n")
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
def get_feature_extractor(self, **kwargs):
return LayoutLMv2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_save_load_pretrained_default(self):
feature_extractor = self.get_feature_extractor()
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)
processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast))
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
def test_save_load_pretrained_additional_features(self):
processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer())
processor.save_pretrained(self.tmpdirname)
# slow tokenizer
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
processor = LayoutLMv2Processor.from_pretrained(
self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
# fast tokenizer
tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30)
processor = LayoutLMv2Processor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30
)
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast)
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
@slow
def test_overflowing_tokens(self):
# In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd")
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
def preprocess_data(examples):
images = [Image.open(path).convert("RGB") for path in examples["image_path"]]
words = examples["words"]
boxes = examples["bboxes"]
word_labels = examples["ner_tags"]
encoded_inputs = processor(
images,
words,
boxes=boxes,
word_labels=word_labels,
padding="max_length",
truncation=True,
return_overflowing_tokens=True,
stride=50,
return_offsets_mapping=True,
return_tensors="pt",
)
return encoded_inputs
train_data = preprocess_data(datasets["train"])
self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
# different use cases tests
@require_torch
@require_pytesseract
class LayoutLMv2ProcessorIntegrationTests(unittest.TestCase):
@cached_property
def get_images(self):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
image_1 = Image.open(ds[0]["file"]).convert("RGB")
image_2 = Image.open(ds[1]["file"]).convert("RGB")
return image_1, image_2
@cached_property
def get_tokenizers(self):
slow_tokenizer = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
fast_tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
return [slow_tokenizer, fast_tokenizer]
@slow
def test_processor_case_1(self):
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
feature_extractor = LayoutLMv2FeatureExtractor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
input_feat_extract = feature_extractor(images[0], return_tensors="pt")
input_processor = processor(images[0], return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify image
self.assertAlmostEqual(
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "[CLS] 11 : 14 to 11 : 39 a. m 11 : 39 to 11 : 44 a. m. 11 : 44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from the floor. dr. emil m. mrak, university of cal - ifornia, chairman, trrf board ; sam r. cecil, university of georgia college of agriculture ; dr. stanley charm, tufts university school of medicine ; dr. robert h. cotton, itt continental baking company ; dr. owen fennema, university of wis - consin ; dr. robert e. hardenburg, usda. questions and answers exhibits open capt. jack stoney room trrf scientific advisory council meeting ballroom foyer [SEP]" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
input_feat_extract = feature_extractor(images, return_tensors="pt")
input_processor = processor(images, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify images
self.assertAlmostEqual(
input_feat_extract["pixel_values"].sum(), input_processor["image"].sum(), delta=1e-2
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "[CLS] 7 itc limited report and accounts 2013 itc s brands : an asset for the nation the consumer needs and aspirations they fulfil, the benefit they generate for millions across itc s value chains, the future - ready capabilities that support them, and the value that they create for the country, have made itc s brands national assets, adding to india s competitiveness. it is itc s aspiration to be the no 1 fmcg player in the country, driven by its new fmcg businesses. a recent nielsen report has highlighted that itc's new fmcg businesses are the fastest growing among the top consumer goods companies operating in india. itc takes justifiable pride that, along with generating economic value, these celebrated indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. di wills * ; love delightfully soft skin? aia ans source : https : / / www. industrydocuments. ucsf. edu / docs / snbx0223 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
@slow
def test_processor_case_2(self):
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
# verify keys
expected_keys = ["input_ids", "bbox", "token_type_ids", "attention_mask", "image"]
actual_keys = list(input_processor.keys())
for key in expected_keys:
self.assertIn(key, actual_keys)
# verify input_ids
expected_decoding = "[CLS] hello world [SEP]"
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "[CLS] hello world [SEP] [PAD] [PAD] [PAD]"
decoding = processor.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [
[0, 0, 0, 0],
[3, 2, 5, 1],
[6, 7, 4, 2],
[3, 9, 2, 4],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1000, 1000, 1000, 1000],
]
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
@slow
def test_processor_case_3(self):
# case 3: token classification (training), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
words = ["weirdly", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
word_labels = [1, 2]
input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "[CLS] weirdly world [SEP]"
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify labels
expected_labels = [-100, 1, -100, 2, -100]
self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)
# batched
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
word_labels = [[1, 2], [6, 3, 10, 2]]
input_processor = processor(
images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
)
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "labels", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "[CLS] my name is niels [SEP]"
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [
[0, 0, 0, 0],
[3, 2, 5, 1],
[6, 7, 4, 2],
[3, 9, 2, 4],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1000, 1000, 1000, 1000],
]
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
# verify labels
expected_labels = [-100, 6, 3, 10, 2, -100, -100]
self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)
@slow
def test_processor_case_4(self):
# case 4: visual question answering (inference), apply_ocr=True
feature_extractor = LayoutLMv2FeatureExtractor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
input_processor = processor(images[0], question, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "[CLS] what's his name? [SEP] 11 : 14 to 11 : 39 a. m 11 : 39 to 11 : 44 a. m. 11 : 44 a. m. to 12 : 25 p. m. 12 : 25 to 12 : 58 p. m. 12 : 58 to 4 : 00 p. m. 2 : 00 to 5 : 00 p. m. coffee break coffee will be served for men and women in the lobby adjacent to exhibit area. please move into exhibit area. ( exhibits open ) trrf general session ( part | ) presiding : lee a. waller trrf vice president “ introductory remarks ” lee a. waller, trrf vice presi - dent individual interviews with trrf public board members and sci - entific advisory council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public refrigerated warehousing industry is looking for. plus questions from the floor. dr. emil m. mrak, university of cal - ifornia, chairman, trrf board ; sam r. cecil, university of georgia college of agriculture ; dr. stanley charm, tufts university school of medicine ; dr. robert h. cotton, itt continental baking company ; dr. owen fennema, university of wis - consin ; dr. robert e. hardenburg, usda. questions and answers exhibits open capt. jack stoney room trrf scientific advisory council meeting ballroom foyer [SEP]" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
questions = ["How old is he?", "what's the time"]
input_processor = processor(
images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
)
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
# this was obtained with Tesseract 4.1.1
expected_decoding = "[CLS] what's the time [SEP] 7 itc limited report and accounts 2013 itc s [SEP]"
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
# fmt: off
expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [372, 59, 407, 66], [74, 136, 161, 158], [74, 136, 161, 158], [74, 136, 161, 158], [74, 136, 161, 158], [1000, 1000, 1000, 1000]] # noqa: E231
# fmt: on
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
@slow
def test_processor_case_5(self):
# case 5: visual question answering (inference), apply_ocr=False
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
input_processor = processor(images[0], question, words, boxes, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "[CLS] what's his name? [SEP] hello world [SEP]"
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
questions = ["How old is he?", "what's the time"]
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "image", "input_ids", "token_type_ids"]
actual_keys = sorted(list(input_processor.keys()))
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "[CLS] how old is he? [SEP] hello world [SEP] [PAD] [PAD] [PAD]"
decoding = processor.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
expected_decoding = "[CLS] what's the time [SEP] my name is niels [SEP]"
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [[6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)