From f71a4577b8a54d4a19fb1b1ca8ba15cfc6b5bb6e Mon Sep 17 00:00:00 2001
From: mgrankin
Date: Thu, 26 Sep 2019 16:53:13 +0300
Subject: [PATCH] faster dataset building

---
 examples/run_lm_finetuning.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 7ccf4c3cb7f..8d440ebcc65 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -74,9 +74,8 @@ class TextDataset(Dataset):
 
             tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
 
-            while len(tokenized_text) >= block_size: # Truncate in block of block_size
-                self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size]))
-                tokenized_text = tokenized_text[block_size:]
+            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
+                self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[i:i+block_size]))
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
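
Note on why this is faster: the old while-loop re-bound `tokenized_text = tokenized_text[block_size:]` on every iteration, copying the whole remaining tail of the token list each time, which is quadratic in the corpus length. The new loop iterates over start indices and slices out only one block per iteration, so the total work is linear. Below is a minimal standalone sketch of the chunking pattern; the `chunk_into_blocks` helper and its identity `add_special_tokens` default are hypothetical stand-ins (for `tokenizer.add_special_tokens_single_sentence` in the patched file), used here only for illustration.

def chunk_into_blocks(token_ids, block_size, add_special_tokens=lambda ids: ids):
    """Split a flat list of token ids into contiguous blocks of block_size."""
    examples = []
    # Iterate over start indices instead of repeatedly re-slicing the
    # remaining list: each iteration copies only block_size items.
    for i in range(0, len(token_ids) - block_size + 1, block_size):
        examples.append(add_special_tokens(token_ids[i:i + block_size]))
    # As in the patched code, the trailing partial block is dropped (no padding).
    return examples


if __name__ == "__main__":
    ids = list(range(10))
    print(chunk_into_blocks(ids, block_size=4))
    # -> [[0, 1, 2, 3], [4, 5, 6, 7]]; ids 8 and 9 are dropped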