Add line by line option to mlm/plm scripts (#8240)

* Make line by line optional in run_mlm
* Add option to disable dynamic padding
* Add option to plm too and update README
* Typos
* More typos
* Even more typos
* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Parent: ebec410c71
Commit: e1b1b614b1
@@ -77,10 +77,16 @@ python run_clm.py \
     --output_dir /tmp/test-clm
 ```

+If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
+concatenates all texts and then splits them in blocks of the same length).
+
+**Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
+sure all your batches have the same length.
+
 ### Whole word masking

 The BERT authors released a new version of BERT using Whole Word Masking in May 2019. Instead of masking randomly
-selected tokens (which may be aprt of words), they mask randomly selected words (masking all the tokens corresponding
+selected tokens (which may be part of words), they mask randomly selected words (masking all the tokens corresponding
 to that word). This technique has been refined for Chinese in [this paper](https://arxiv.org/abs/1906.08101).

 To fine-tune a model using whole word masking, use the following script:
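As a quick illustration of the `--line_by_line` behaviour documented above, here is a toy sketch (not from the repository) that uses whitespace-split "tokens" in place of a real tokenizer:

```python
# Toy sketch of the two preprocessing modes: --line_by_line keeps each nonempty
# line as its own sample, while the default concatenates everything and cuts
# fixed-size blocks, dropping the remainder.
lines = ["the first sample", "", "a second, longer sample of text"]
block_size = 4

# --line_by_line: one sample per nonempty line.
per_line = [l.split() for l in lines if len(l) > 0 and not l.isspace()]
# [['the', 'first', 'sample'], ['a', 'second,', 'longer', 'sample', 'of', 'text']]

# Default: concatenate, then split into blocks of block_size.
tokens = [tok for l in lines for tok in l.split()]
usable = (len(tokens) // block_size) * block_size
blocks = [tokens[i : i + block_size] for i in range(0, usable, block_size)]
# [['the', 'first', 'sample', 'a'], ['second,', 'longer', 'sample', 'of']]  -> 'text' is dropped
```

The real scripts do the same thing with tokenizer output columns instead of word lists, as the code changes further down show.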
@@ -111,8 +117,8 @@ It works well on so many Chines Task like CLUE (Chinese GLUE). They use LTP, so
 we need LTP.

 Now LTP only only works well on `transformers==3.2.0`. So we don't add it to requirements.txt.
-You need to create a separate enviromnent with this version of Transformers to run the `run_chinese_ref.py` script that
-will create the reference files. The script is in `examples/contrib`. Once in the proper enviromnent, run the
+You need to create a separate environment with this version of Transformers to run the `run_chinese_ref.py` script that
+will create the reference files. The script is in `examples/contrib`. Once in the proper environment, run the
 following:


@@ -144,6 +150,8 @@ python run_mlm_wwm.py \
     --output_dir /tmp/test-mlm-wwm
 ```

+**Note:** On TPU, you should the flag `--pad_to_max_length` to make sure all your batches have the same length.
+
 ### XLNet and permutation language modeling

 XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
@@ -179,3 +187,9 @@ python run_plm.py \
     --do_eval \
     --output_dir /tmp/test-plm
 ```
+
+If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
+concatenates all texts and then splits them in blocks of the same length).
+
+**Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
+sure all your batches have the same length.
@@ -116,6 +116,17 @@ class DataTrainingArguments:
     mlm_probability: float = field(
         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
     )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )

     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
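For context on how these dataclass fields turn into command-line options: the example scripts parse them with `transformers.HfArgumentParser`. The snippet below is a minimal, hedged sketch; the standalone parser and the shortened help text are illustrative, not copied from run_mlm.py.

```python
# Minimal sketch: bool dataclass fields with default=False become opt-in CLI
# flags when parsed by HfArgumentParser. Field names mirror the diff above.
from dataclasses import dataclass, field
from transformers import HfArgumentParser

@dataclass
class DataTrainingArguments:
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={"help": "Whether to pad all samples to `max_seq_length`."},
    )

parser = HfArgumentParser(DataTrainingArguments)
# Equivalent to passing --line_by_line --pad_to_max_length on the command line.
(data_args,) = parser.parse_args_into_dataclasses(["--line_by_line", "--pad_to_max_length"])
print(data_args.line_by_line, data_args.pad_to_max_length)  # True True
```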
@@ -246,18 +257,73 @@ def main():
         column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

-    def tokenize_function(examples):
-        # Remove empty lines
-        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False

-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=[text_column_name],
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name])
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+
+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )

     # Data collator
     # This one will take care of randomly masking the tokens.
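To see what the `group_texts` step above does, here is a self-contained re-run of the same logic on made-up token ids (toy data, not from the scripts):

```python
# Standalone copy of the group_texts logic, run with max_seq_length=4. Every
# column is chunked the same way and the tail that does not fill a block is dropped.
max_seq_length = 4

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // max_seq_length) * max_seq_length
    return {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }

batch = {
    "input_ids": [[101, 7592, 102], [101, 2088, 999, 102], [101, 102]],
    "attention_mask": [[1, 1, 1], [1, 1, 1, 1], [1, 1]],
}
print(group_texts(batch))
# {'input_ids': [[101, 7592, 102, 101], [2088, 999, 102, 101]],
#  'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]]}  -> the final id 102 is dropped
```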
@@ -120,6 +120,13 @@ class DataTrainingArguments:
     mlm_probability: float = field(
         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
     )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )

     def __post_init__(self):
         if self.train_file is not None:
@@ -253,10 +260,12 @@ def main():
         column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

+    padding = "max_length" if data_args.pad_to_max_length else False
+
     def tokenize_function(examples):
         # Remove empty lines
         examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
+        return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)

     tokenized_datasets = datasets.map(
         tokenize_function,
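A hedged sketch of what the new `padding` switch changes, in isolation; it assumes the `bert-base-uncased` tokenizer can be downloaded, and neither the checkpoint nor the sample lines come from this diff:

```python
# Sketch of the two padding settings: "max_length" produces fixed-size samples
# (what --pad_to_max_length enables), while False leaves samples unpadded so the
# data collator can pad each batch dynamically.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
lines = ["a short line", "a slightly longer line of text"]

fixed = tokenizer(lines, padding="max_length", truncation=True, max_length=16)
dynamic = tokenizer(lines, padding=False, truncation=True, max_length=16)

print([len(ids) for ids in fixed["input_ids"]])    # [16, 16] -> identical shapes every batch
print([len(ids) for ids in dynamic["input_ids"]])  # e.g. [5, 8] -> padded later, per batch
```

Dynamic padding keeps batches small on GPU, while the fixed `max_length` shape avoids recompilation on TPU, which is why the README note pairs `--pad_to_max_length` with `--line_by_line` there.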
@@ -113,6 +113,17 @@ class DataTrainingArguments:
     max_span_length: int = field(
         default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."}
     )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )

     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
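For reference, `max_span_length` is presumably forwarded to `DataCollatorForPermutationLanguageModeling`, which appears as context at the end of the last hunk below. A hedged, self-contained sketch follows; the `xlnet-base-cased` checkpoint and the toy sentence are assumptions, and the collator requires an even sequence length:

```python
# Sketch: the permutation-LM collator masks random spans of up to max_span_length
# tokens and builds the permutation mask / target mapping used by XLNet's objective.
import torch
from transformers import AutoTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")  # assumed checkpoint
data_collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer,
    plm_probability=1 / 6,
    max_span_length=5,  # mirrors the new dataclass field's default
)

# Pad to an even length; the collator rejects odd sequence lengths.
encoded = tokenizer.encode("A tiny example sentence.", padding="max_length", max_length=12)
batch = data_collator([torch.tensor(encoded)])
print(sorted(batch.keys()))  # ['input_ids', 'labels', 'perm_mask', 'target_mapping']
```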
@@ -243,18 +254,73 @@ def main():
         column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

-    def tokenize_function(examples):
-        # Remove empty lines
-        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False

-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=[text_column_name],
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name])
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+
+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )

     # Data collator
     data_collator = DataCollatorForPermutationLanguageModeling(