From 569897ce2caaaafca9dfe7da378db0631f3a2e2c Mon Sep 17 00:00:00 2001
From: Lysandre
Date: Mon, 10 Feb 2020 16:42:49 -0500
Subject: [PATCH] Fix a few issues regarding the language modeling script

---
 examples/run_language_modeling.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/run_language_modeling.py b/examples/run_language_modeling.py
index 00e9e2f1234..dc68a67ce80 100644
--- a/examples/run_language_modeling.py
+++ b/examples/run_language_modeling.py
@@ -130,9 +130,9 @@ class LineByLineTextDataset(Dataset):
         logger.info("Creating features from dataset file at %s", file_path)
 
         with open(file_path, encoding="utf-8") as f:
-            lines = [line for line in f.read().splitlines() if len(line) > 0]
+            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
 
-        self.examples = tokenizer.batch_encode_plus(lines, max_length=block_size)["input_ids"]
+        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
 
     def __len__(self):
         return len(self.examples)
@@ -704,10 +704,10 @@ def main():
     )
 
     if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence
+        args.block_size = tokenizer.max_len
         # Our input block size will be the max possible for the model
     else:
-        args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
+        args.block_size = min(args.block_size, tokenizer.max_len)
 
     if args.model_name_or_path:
         model = model_class.from_pretrained(
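
As context, the patched feature creation reduces to roughly the standalone sketch below. It assumes the transformers 2.x-era API this script targets (where batch_encode_plus and tokenizer.max_len still exist); the helper name, the "gpt2" checkpoint, and the train.txt path are illustrative placeholders rather than part of the patch.

# Standalone sketch of the patched LineByLineTextDataset feature creation.
# Assumes a transformers 2.x-era tokenizer (batch_encode_plus, tokenizer.max_len);
# the helper name, checkpoint, and file path below are illustrative placeholders.
from transformers import AutoTokenizer


def load_line_by_line_examples(file_path, tokenizer, block_size):
    with open(file_path, encoding="utf-8") as f:
        # Drop empty lines AND whitespace-only lines (the patched condition),
        # so no example consists solely of special tokens.
        lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

    # Explicitly add the model's special tokens and truncate each line to
    # block_size, mirroring the patched batch_encode_plus call.
    return tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # With the patch, a non-positive --block_size falls back to tokenizer.max_len,
    # i.e. the full model length including special tokens.
    block_size = tokenizer.max_len
    examples = load_line_by_line_examples("train.txt", tokenizer, block_size)
    print("Loaded %d examples" % len(examples))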