From 32b08742a58b43a5a905a28e434b8f67321be024 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 13 Apr 2023 17:22:59 +0200 Subject: [PATCH] =?UTF-8?q?`DocumentQuestionAnsweringPipeline`=20only=20fo?= =?UTF-8?q?r=20fast=20=E2=9A=A1=20tokenizers=20(#22745)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix --------- Co-authored-by: ydshieh --- .../pipelines/document_question_answering.py | 5 +++++ tests/models/layoutlm/test_modeling_layoutlm.py | 14 -------------- .../models/layoutlmv2/test_modeling_layoutlmv2.py | 11 +++-------- tests/test_pipeline_mixin.py | 10 ++++++++++ 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py index 78f49a5e2da..936d728b598 100644 --- a/src/transformers/pipelines/document_question_answering.py +++ b/src/transformers/pipelines/document_question_answering.py @@ -131,6 +131,11 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"): + raise ValueError( + "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer " + f"(`{self.tokenizer.__class__.__name__}`) is provided." + ) if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig": self.model_type = ModelType.VisionEncoderDecoder diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index d2aad061c38..0535fbf4e1f 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -246,20 +246,6 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase ) fx_compatible = True - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # This pipeline uses `sequence_ids()` which is only available for fast tokenizers. - return True - - return False - def setUp(self): self.model_tester = LayoutLMModelTester(self) self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 6c82a34a626..2b17eadff57 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -295,15 +295,10 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa # `LayoutLMv2Config` was never used in pipeline tests (`test_pt_LayoutLMv2Config_XXX`) due to lack of tiny # config. With new tiny model creation, it is available, but we need to fix the failed tests. return True - elif ( - pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # This pipeline uses `sequence_ids()` which is only available for fast tokenizers. - return True - return False + return super().is_pipeline_test_to_skip( + pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + ) def setUp(self): self.model_tester = LayoutLMv2ModelTester(self) diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index 82a23a94b40..a73121966c3 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -428,9 +428,19 @@ class PipelineTesterMixin: def test_pipeline_zero_shot_object_detection(self): self.run_task_tests(task="zero-shot-object-detection") + # This contains the test cases to be skipped without model architecture being involved. def is_pipeline_test_to_skip( self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name ): + # No fix is required for this case. + if ( + pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests" + and tokenizer_name is not None + and not tokenizer_name.endswith("Fast") + ): + # `DocumentQuestionAnsweringPipelineTests` requires a fast tokenizer. + return True + return False