mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
faster dataset building
This commit is contained in:
parent
0f92f76ca3
commit
f71a4577b8
@ -74,9 +74,8 @@ class TextDataset(Dataset):
|
||||
|
||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||
|
||||
while len(tokenized_text) >= block_size: # Truncate in block of block_size
|
||||
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size]))
|
||||
tokenized_text = tokenized_text[block_size:]
|
||||
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
||||
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[i:i+block_size]))
|
||||
# Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
|
||||
# If your dataset is small, first you should loook for a bigger one :-) and second you
|
||||
# can change this behavior by adding (model specific) padding.
|
||||
|
Loading…
Reference in New Issue
Block a user