DocumentQuestionAnsweringPipeline only for fast ⚡ tokenizers (#22745)

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-07-04 13:20:12 +06:00 · 2023-04-13 17:22:59 +02:00 · 2023-04-13 17:22:59 +02:00 · 32b08742a5
commit 32b08742a5
parent 4def2fe969
4 changed files with 18 additions and 22 deletions
--- a/src/transformers/pipelines/document_question_answering.py
+++ b/src/transformers/pipelines/document_question_answering.py
@@ -131,6 +131,11 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
+        if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"):
+            raise ValueError(
+                "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer "
+                f"(`{self.tokenizer.__class__.__name__}`) is provided."
+            )

        if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig":
            self.model_type = ModelType.VisionEncoderDecoder
--- a/tests/models/layoutlm/test_modeling_layoutlm.py
+++ b/tests/models/layoutlm/test_modeling_layoutlm.py
@@ -246,20 +246,6 @@ class LayoutLMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
    )
    fx_compatible = True

-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if (
-            pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests"
-            and tokenizer_name is not None
-            and not tokenizer_name.endswith("Fast")
-        ):
-            # This pipeline uses `sequence_ids()` which is only available for fast tokenizers.
-            return True
-
-        return False
-
    def setUp(self):
        self.model_tester = LayoutLMModelTester(self)
        self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37)
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -295,15 +295,10 @@ class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
            # `LayoutLMv2Config` was never used in pipeline tests (`test_pt_LayoutLMv2Config_XXX`) due to lack of tiny
            # config. With new tiny model creation, it is available, but we need to fix the failed tests.
            return True
-        elif (
-            pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests"
-            and tokenizer_name is not None
-            and not tokenizer_name.endswith("Fast")
-        ):
-            # This pipeline uses `sequence_ids()` which is only available for fast tokenizers.
-            return True

-        return False
+        return super().is_pipeline_test_to_skip(
+            pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+        )

    def setUp(self):
        self.model_tester = LayoutLMv2ModelTester(self)
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -428,9 +428,19 @@ class PipelineTesterMixin:
    def test_pipeline_zero_shot_object_detection(self):
        self.run_task_tests(task="zero-shot-object-detection")

+    # This contains the test cases to be skipped without model architecture being involved.
    def is_pipeline_test_to_skip(
        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    ):
+        # No fix is required for this case.
+        if (
+            pipeline_test_casse_name == "DocumentQuestionAnsweringPipelineTests"
+            and tokenizer_name is not None
+            and not tokenizer_name.endswith("Fast")
+        ):
+            # `DocumentQuestionAnsweringPipelineTests` requires a fast tokenizer.
+            return True
+
        return False