Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-01 18:51:14 +06:00)
Switch from using sum for flattening lists of lists in group_texts (#14472)
* remove sum for list flattening
* change to chain(*)
* make chain object a list
* delete empty lines per sgugger's suggestions

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Nicholas Broad <nicholas@nmbroad.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
parent 0b7d053c13
commit 69e16abf98
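For context, here is a minimal sketch of the pattern this commit swaps out (the toy list below is invented for illustration and does not appear in the diff): sum(list_of_lists, []) rebuilds the accumulator list on every step, so flattening is quadratic in the total number of elements, while itertools.chain walks each sub-list once and is linear.

from itertools import chain

# A batch column as the datasets library passes it with batched=True: one list per example.
batch_column = [[101, 2023, 102], [101, 2003, 102], [101, 1037, 102]]

# Old pattern removed in this commit: repeated list concatenation (quadratic).
flattened_slow = sum(batch_column, [])

# New pattern: chain the sub-lists and materialize the result once (linear).
flattened_fast = list(chain(*batch_column))

assert flattened_slow == flattened_fast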
@@ -27,6 +27,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional
 
@@ -430,7 +431,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
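To make the repeated group_texts hunks easier to read in isolation, here is a hedged, self-contained sketch of the helper: the first lines of the body mirror the hunk above, while block_size, the chunking tail, and the toy batch are illustrative reconstructions of how the example scripts use the function rather than lines taken from this diff.

from itertools import chain

block_size = 4  # illustrative; the real scripts derive this from the tokenizer/model configuration


def group_texts(examples):
    # Concatenate all texts (this is the line the commit changes: chain instead of sum).
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every chunk has exactly block_size tokens.
    total_length = (total_length // block_size) * block_size
    # Split the concatenated columns into block_size chunks.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }


# Toy batch in the format datasets passes with batched=True: column name -> list of per-example lists.
batch = {"input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9]]}
print(group_texts(batch))  # {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]]}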
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 
 # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 from pathlib import Path
@@ -453,7 +454,7 @@ if __name__ == "__main__":
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Dict, List, Optional
 
@@ -563,7 +564,7 @@ if __name__ == "__main__":
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 
 import datasets
@@ -408,7 +409,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path
 
 import datasets
@@ -366,7 +367,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 
 import datasets
@@ -432,7 +433,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path
 
 import datasets
@@ -406,7 +407,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -23,6 +23,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 
 import datasets
@@ -403,7 +404,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional, Union
 
 import datasets
@@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))
 
         batch = self.tokenizer.pad(
             flattened_features,
@@ -333,8 +334,8 @@ def main():
         ]
 
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
 
         # Tokenize
         tokenized_examples = tokenizer(
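As a reading aid for the two multiple-choice hunks above: the collator flattens the (batch, num_choices) nesting so the tokenizer can pad one flat batch, then restores the grouping afterwards. The snippet below is only an illustrative sketch with invented toy features; it is not code from the diff, and it regroups plain lists where the real collator reshapes padded tensors.

from itertools import chain

num_choices = 3
# Two toy examples, each with num_choices candidate inputs (values are made up).
features = [[{"input_ids": [101, example, choice, 102]} for choice in range(num_choices)] for example in range(2)]

# Flatten one nesting level so a pad call sees a plain batch of 2 * num_choices items.
flattened_features = list(chain(*features))
assert len(flattened_features) == 2 * num_choices

# After padding, the collator restores the (batch_size, num_choices, ...) grouping; with lists it looks like this.
regrouped = [flattened_features[i : i + num_choices] for i in range(0, len(flattened_features), num_choices)]
assert len(regrouped) == 2 and len(regrouped[0]) == num_choices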
@@ -24,6 +24,7 @@ import math
 import os
 import random
 from dataclasses import dataclass
+from itertools import chain
 from pathlib import Path
 from typing import Optional, Union
 
@@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))
 
         batch = self.tokenizer.pad(
             flattened_features,
@@ -365,8 +366,8 @@ def main():
         labels = examples[label_column_name]
 
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
 
         # Tokenize
         tokenized_examples = tokenizer(
@@ -23,6 +23,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional
 
@@ -364,7 +365,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -30,6 +30,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional
 
@@ -406,7 +407,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -32,6 +32,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional
 
@@ -462,7 +463,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Optional
 
@@ -342,8 +343,8 @@ def main():
         ]
 
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
 
         # Tokenize
         tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)
@@ -35,6 +35,7 @@ from dataclasses import fields
 from enum import Enum
 from functools import partial, wraps
 from hashlib import sha256
+from itertools import chain
 from pathlib import Path
 from types import ModuleType
 from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union
@@ -2129,7 +2130,7 @@ class _LazyModule(ModuleType):
             for value in values:
                 self._class_to_module[value] = key
         # Needed for autocompletion in an IDE
-        self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), [])
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
         self.__file__ = module_file
         self.__spec__ = module_spec
         self.__path__ = [os.path.dirname(module_file)]
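The last hunk applies the same flattening swap inside _LazyModule, where __all__ is built from a mapping of submodule names to the public names they expose. The dictionary below is a made-up illustration of that shape, not the real import_structure from the library.

from itertools import chain

# Illustrative import_structure: submodule name -> list of names it exposes.
import_structure = {
    "configuration_utils": ["PretrainedConfig"],
    "tokenization_utils": ["PreTrainedTokenizer", "AddedToken"],
}

# Equivalent of the patched line: submodule names plus every exposed name,
# flattening the dict's list values with chain instead of sum.
all_names = list(import_structure.keys()) + list(chain(*import_structure.values()))
print(all_names)
# ['configuration_utils', 'tokenization_utils', 'PretrainedConfig', 'PreTrainedTokenizer', 'AddedToken']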