mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-13 17:48:22 +06:00
fix bug in group_texts function that was inserting short batches (#23429)
* fix bug in group_texts function that was inserting short batches
* fully exclude short batches and return empty dict instead
* fix style
This commit is contained in:
parent
b7b81d9344
commit
a7920065f2
@ -491,10 +491,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= block_size:
|
total_length = (total_length // block_size) * block_size
|
||||||
total_length = (total_length // block_size) * block_size
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
@ -434,10 +434,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= block_size:
|
total_length = (total_length // block_size) * block_size
|
||||||
total_length = (total_length // block_size) * block_size
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
@ -506,10 +506,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= max_seq_length:
|
total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
@ -472,10 +472,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= max_seq_length:
|
total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
@ -450,10 +450,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= max_seq_length:
|
total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
Loading…
Reference in New Issue
Block a user