With the new padding/truncation paradigm, setting padding="max_length" together with max_length=X pads every input up to max_length.

As a result, every sample going through the QA pipelines ends up with size 384 regardless of its actual input size, which makes the overall pipeline very slow.
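For context, a minimal sketch of the behaviour described above (not part of this commit; it assumes a transformers install with the v3 tokenizer API, and the checkpoint name and texts are purely illustrative):

from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PaddingStrategy

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# padding="max_length" pads every encoding up to max_length,
# no matter how short the actual input is.
padded = tokenizer.encode_plus(
    "Who wrote it?", "A short context.",
    padding="max_length", max_length=384,
)
print(len(padded["input_ids"]))  # 384

# PaddingStrategy.DO_NOT_PAD keeps the natural encoding length.
unpadded = tokenizer.encode_plus(
    "Who wrote it?", "A short context.",
    padding=PaddingStrategy.DO_NOT_PAD, max_length=384,
)
print(len(unpadded["input_ids"]))  # only as many tokens as the input needs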

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
Morgan Funtowicz 2020-07-01 10:39:09 +02:00
parent 90fbc544ca
commit 1b00a9a2ff


@@ -10,7 +10,7 @@ from tqdm import tqdm
 from ...file_utils import is_tf_available, is_torch_available
 from ...tokenization_bert import whitespace_tokenize
 from .utils import DataProcessor
+from ...tokenization_utils_base import PaddingStrategy

 if is_torch_available():
     import torch
@@ -137,7 +137,7 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
     truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
     span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
     truncation="only_second" if tokenizer.padding_side == "right" else "only_first",
-    padding="max_length",
+    padding=PaddingStrategy.DO_NOT_PAD,
     max_length=max_seq_length,
     return_overflowing_tokens=True,
     stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
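With the features no longer padded at creation time, a downstream batcher can pad dynamically to the longest sample in each batch instead of a fixed 384 tokens. A hedged sketch of that idea, assuming a transformers version where tokenizer.pad is available (the checkpoint and texts are again illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encodings = [
    tokenizer.encode_plus("Who wrote it?", "A short context."),
    tokenizer.encode_plus("Who wrote it?", "A much longer context. " * 20),
]
# Pad only up to the longest sample in this batch, not a fixed 384.
batch = tokenizer.pad(encodings, padding="longest", return_tensors="pt")
print(batch["input_ids"].shape)  # (2, length of the longest encoding)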