Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-01 18:51:14 +06:00)
Switch from using sum for flattening lists of lists in group_texts (#14472)
* remove sum for list flattening
* change to chain(*)
* make chain object a list
* delete empty lines per sgugger's suggestions

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Nicholas Broad <nicholas@nmbroad.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
parent 0b7d053c13
commit 69e16abf98
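For context, here is a minimal sketch of the pattern this commit swaps out (the toy list below is invented for illustration and does not appear in the diff): sum(list_of_lists, []) rebuilds the accumulator list on every step, so flattening is quadratic in the total number of elements, while itertools.chain walks each sub-list once and is linear.

from itertools import chain

# A batch column as the datasets library passes it with batched=True: one list per example.
batch_column = [[101, 2023, 102], [101, 2003, 102], [101, 1037, 102]]

# Old pattern removed in this commit: repeated list concatenation (quadratic).
flattened_slow = sum(batch_column, [])

# New pattern: chain the sub-lists and materialize the result once (linear).
flattened_fast = list(chain(*batch_column))

assert flattened_slow == flattened_fast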
@@ -27,6 +27,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional
 
@@ -430,7 +431,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
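To make the repeated group_texts hunks easier to read in isolation, here is a hedged, self-contained sketch of the helper: the first lines of the body mirror the hunk above, while block_size, the chunking tail, and the toy batch are illustrative reconstructions of how the example scripts use the function rather than lines taken from this diff.

from itertools import chain

block_size = 4  # illustrative; the real scripts derive this from the tokenizer/model configuration


def group_texts(examples):
    # Concatenate all texts (this is the line the commit changes: chain instead of sum).
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every chunk has exactly block_size tokens.
    total_length = (total_length // block_size) * block_size
    # Split the concatenated columns into block_size chunks.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }


# Toy batch in the format datasets passes with batched=True: column name -> list of per-example lists.
batch = {"input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9]]}
print(group_texts(batch))  # {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]]}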
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 
 # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
 from pathlib import Path
@@ -453,7 +454,7 @@ if __name__ == "__main__":
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -25,6 +25,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Dict, List, Optional
 
@@ -563,7 +564,7 @@ if __name__ == "__main__":
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 
 import datasets
@@ -408,7 +409,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path
 
 import datasets
@@ -366,7 +367,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -26,6 +26,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 
 import datasets
@@ -432,7 +433,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -27,6 +27,7 @@ import logging
 import math
 import os
 import random
+from itertools import chain
 from pathlib import Path
 
 import datasets
@@ -406,7 +407,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -23,6 +23,7 @@ import math
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional
 
 import datasets
@@ -403,7 +404,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional, Union
 
 import datasets
@@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))
 
         batch = self.tokenizer.pad(
             flattened_features,
@@ -333,8 +334,8 @@ def main():
         ]
 
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
 
         # Tokenize
         tokenized_examples = tokenizer(
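As a reading aid for the two multiple-choice hunks above: the collator flattens the (batch, num_choices) nesting so the tokenizer can pad one flat batch, then restores the grouping afterwards. The snippet below is only an illustrative sketch with invented toy features; it is not code from the diff, and it regroups plain lists where the real collator reshapes padded tensors.

from itertools import chain

num_choices = 3
# Two toy examples, each with num_choices candidate inputs (values are made up).
features = [[{"input_ids": [101, example, choice, 102]} for choice in range(num_choices)] for example in range(2)]

# Flatten one nesting level so a pad call sees a plain batch of 2 * num_choices items.
flattened_features = list(chain(*features))
assert len(flattened_features) == 2 * num_choices

# After padding, the collator restores the (batch_size, num_choices, ...) grouping; with lists it looks like this.
regrouped = [flattened_features[i : i + num_choices] for i in range(0, len(flattened_features), num_choices)]
assert len(regrouped) == 2 and len(regrouped[0]) == num_choices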
@@ -24,6 +24,7 @@ import math
 import os
 import random
 from dataclasses import dataclass
+from itertools import chain
 from pathlib import Path
 from typing import Optional, Union
 
@@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
         flattened_features = [
             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
         ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))
 
         batch = self.tokenizer.pad(
             flattened_features,
@@ -365,8 +366,8 @@ def main():
         labels = examples[label_column_name]
 
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
 
         # Tokenize
         tokenized_examples = tokenizer(
@@ -23,6 +23,7 @@ import os
 import sys
 import time
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Callable, Optional
 
@@ -364,7 +365,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -30,6 +30,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional
 
@@ -406,7 +407,7 @@ def main():
     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -32,6 +32,7 @@ import random
 import sys
 from dataclasses import dataclass, field
 from functools import partial
+from itertools import chain
 from pathlib import Path
 from typing import Optional
 
@@ -462,7 +463,7 @@ def main():
     # max_seq_length.
     def group_texts(examples):
         # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
         total_length = len(concatenated_examples[list(examples.keys())[0]])
         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
         # customize this part to your needs.
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from pathlib import Path
 from typing import Optional
 
@@ -342,8 +343,8 @@ def main():
         ]
 
         # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))
 
         # Tokenize
         tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)
@@ -35,6 +35,7 @@ from dataclasses import fields
 from enum import Enum
 from functools import partial, wraps
 from hashlib import sha256
+from itertools import chain
 from pathlib import Path
 from types import ModuleType
 from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union
@@ -2129,7 +2130,7 @@ class _LazyModule(ModuleType):
             for value in values:
                 self._class_to_module[value] = key
         # Needed for autocompletion in an IDE
-        self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), [])
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
         self.__file__ = module_file
         self.__spec__ = module_spec
         self.__path__ = [os.path.dirname(module_file)]
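The last hunk applies the same flattening swap inside _LazyModule, where __all__ is built from a mapping of submodule names to the public names they expose. The dictionary below is a made-up illustration of that shape, not the real import_structure from the library.

from itertools import chain

# Illustrative import_structure: submodule name -> list of names it exposes.
import_structure = {
    "configuration_utils": ["PretrainedConfig"],
    "tokenization_utils": ["PreTrainedTokenizer", "AddedToken"],
}

# Equivalent of the patched line: submodule names plus every exposed name,
# flattening the dict's list values with chain instead of sum.
all_names = list(import_structure.keys()) + list(chain(*import_structure.values()))
print(all_names)
# ['configuration_utils', 'tokenization_utils', 'PretrainedConfig', 'PreTrainedTokenizer', 'AddedToken']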