Switch from using sum for flattening lists of lists in group_texts (#14472)

* remove sum for list flattening

* change to chain(*)

* make chain object a list

* delete empty lines

per sgugger's suggestions

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Nicholas Broad <nicholas@nmbroad.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Nicholas Broad 2021-11-22 16:17:26 -05:00 committed by GitHub
parent 0b7d053c13
commit 69e16abf98
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 35 additions and 20 deletions

View File

@ -27,6 +27,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
+from itertools import chain
from pathlib import Path
from typing import Callable, Optional
@ -430,7 +431,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -25,6 +25,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
+from itertools import chain
# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
from pathlib import Path
@ -453,7 +454,7 @@ if __name__ == "__main__":
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -25,6 +25,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
+from itertools import chain
from pathlib import Path
from typing import Dict, List, Optional
@ -563,7 +564,7 @@ if __name__ == "__main__":
# Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -26,6 +26,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
+from itertools import chain
from typing import Optional
import datasets
@ -408,7 +409,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -27,6 +27,7 @@ import logging
import math
import os
import random
+from itertools import chain
from pathlib import Path
import datasets
@ -366,7 +367,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -26,6 +26,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
+from itertools import chain
from typing import Optional
import datasets
@ -432,7 +433,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -27,6 +27,7 @@ import logging
import math
import os
import random
+from itertools import chain
from pathlib import Path
import datasets
@ -406,7 +407,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -23,6 +23,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
+from itertools import chain
from typing import Optional
import datasets
@ -403,7 +404,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -22,6 +22,7 @@ import logging
import os
import sys
from dataclasses import dataclass, field
+from itertools import chain
from typing import Optional, Union
import datasets
@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
-flattened_features = sum(flattened_features, [])
+flattened_features = list(chain(*flattened_features))
batch = self.tokenizer.pad(
flattened_features,
@ -333,8 +334,8 @@ def main():
]
# Flatten out
-first_sentences = sum(first_sentences, [])
-second_sentences = sum(second_sentences, [])
+first_sentences = list(chain(*first_sentences))
+second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(

View File

@ -24,6 +24,7 @@ import math
import os
import random
from dataclasses import dataclass
+from itertools import chain
from pathlib import Path
from typing import Optional, Union
@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
-flattened_features = sum(flattened_features, [])
+flattened_features = list(chain(*flattened_features))
batch = self.tokenizer.pad(
flattened_features,
@ -365,8 +366,8 @@ def main():
labels = examples[label_column_name]
# Flatten out
-first_sentences = sum(first_sentences, [])
-second_sentences = sum(second_sentences, [])
+first_sentences = list(chain(*first_sentences))
+second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(

View File

@ -23,6 +23,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
+from itertools import chain
from pathlib import Path
from typing import Callable, Optional
@ -364,7 +365,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -30,6 +30,7 @@ import random
import sys
from dataclasses import dataclass, field
from functools import partial
+from itertools import chain
from pathlib import Path
from typing import Optional
@ -406,7 +407,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -32,6 +32,7 @@ import random
import sys
from dataclasses import dataclass, field
from functools import partial
+from itertools import chain
from pathlib import Path
from typing import Optional
@ -462,7 +463,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
-concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@ -22,6 +22,7 @@ import logging
import os
import sys
from dataclasses import dataclass, field
+from itertools import chain
from pathlib import Path
from typing import Optional
@ -342,8 +343,8 @@ def main():
]
# Flatten out
-first_sentences = sum(first_sentences, [])
-second_sentences = sum(second_sentences, [])
+first_sentences = list(chain(*first_sentences))
+second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)

View File

@ -35,6 +35,7 @@ from dataclasses import fields
from enum import Enum
from functools import partial, wraps
from hashlib import sha256
+from itertools import chain
from pathlib import Path
from types import ModuleType
from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union
@ -2129,7 +2130,7 @@ class _LazyModule(ModuleType):
for value in values:
self._class_to_module[value] = key
# Needed for autocompletion in an IDE
-self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), [])
+self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
self.__file__ = module_file
self.__spec__ = module_spec
self.__path__ = [os.path.dirname(module_file)]