From f71a4577b8a54d4a19fb1b1ca8ba15cfc6b5bb6e Mon Sep 17 00:00:00 2001
From: mgrankin
Date: Thu, 26 Sep 2019 16:53:13 +0300
Subject: [PATCH] faster dataset building

---
 examples/run_lm_finetuning.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 7ccf4c3cb7f..8d440ebcc65 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -74,9 +74,8 @@ class TextDataset(Dataset):
 
             tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
 
-            while len(tokenized_text) >= block_size: # Truncate in block of block_size
-                self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size]))
-                tokenized_text = tokenized_text[block_size:]
+            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
+                self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[i:i+block_size]))
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
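
Note on why this is faster: the old while-loop re-bound `tokenized_text = tokenized_text[block_size:]` on every iteration, copying the whole remaining tail of the token list each time, which is quadratic in the corpus length. The new loop iterates over start indices and slices out only one block per iteration, so the total work is linear. Below is a minimal standalone sketch of the chunking pattern; the `chunk_into_blocks` helper and its identity `add_special_tokens` default are hypothetical stand-ins (for `tokenizer.add_special_tokens_single_sentence` in the patched file), used here only for illustration.

def chunk_into_blocks(token_ids, block_size, add_special_tokens=lambda ids: ids):
    """Split a flat list of token ids into contiguous blocks of block_size."""
    examples = []
    # Iterate over start indices instead of repeatedly re-slicing the
    # remaining list: each iteration copies only block_size items.
    for i in range(0, len(token_ids) - block_size + 1, block_size):
        examples.append(add_special_tokens(token_ids[i:i + block_size]))
    # As in the patched code, the trailing partial block is dropped (no padding).
    return examples


if __name__ == "__main__":
    ids = list(range(10))
    print(chunk_into_blocks(ids, block_size=4))
    # -> [[0, 1, 2, 3], [4, 5, 6, 7]]; ids 8 and 9 are dropped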