mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-13 17:48:22 +06:00
fix bug in group_texts function that was inserting short batches (#23429)
* fix bug in group_texts function that was inserting short batches
* fully exclude short batches and return empty dict instead
* fix style
This commit is contained in:
parent
b7b81d9344
commit
a7920065f2
@ -491,10 +491,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= block_size:
|
total_length = (total_length // block_size) * block_size
|
||||||
total_length = (total_length // block_size) * block_size
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
@ -434,10 +434,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= block_size:
|
total_length = (total_length // block_size) * block_size
|
||||||
total_length = (total_length // block_size) * block_size
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
|
||||||
|
@ -506,10 +506,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= max_seq_length:
|
total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
@ -472,10 +472,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= max_seq_length:
|
total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
@ -450,10 +450,9 @@ def main():
|
|||||||
# Concatenate all texts.
|
# Concatenate all texts.
|
||||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
# We drop the small remainder, and if the total_length < max_seq_length we exclude this batch and return an empty dict.
|
||||||
# customize this part to your needs.
|
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
||||||
if total_length >= max_seq_length:
|
total_length = (total_length // max_seq_length) * max_seq_length
|
||||||
total_length = (total_length // max_seq_length) * max_seq_length
|
|
||||||
# Split by chunks of max_len.
|
# Split by chunks of max_len.
|
||||||
result = {
|
result = {
|
||||||
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
|
||||||
|
Loading…
Reference in New Issue
Block a user