add preprocessing_num_workers to run_classification.py (#31586)

Add a preprocessing_num_workers option to speed up dataset preprocessing.
2025-07-31 02:02:21 +06:00 · 2024-06-25 19:35:50 +08:00 · 2024-06-25 19:35:50 +08:00 · e73a97a2b3
commit e73a97a2b3
parent fc689d75a0
1 changed files with 5 additions and 0 deletions
--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@@ -133,6 +133,10 @@ class DataTrainingArguments:
            )
        },
    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
@@ -573,6 +577,7 @@ def main():
        raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )