Fix a few issues regarding the language modeling script

2025-07-31 18:22:34 +06:00 · 2020-02-10 16:42:49 -05:00 · 2020-02-10 16:42:49 -05:00 · 569897ce2c
commit 569897ce2c
parent 21da895013
1 changed file with 4 additions and 4 deletions
--- a/examples/run_language_modeling.py
+++ b/examples/run_language_modeling.py
@@ -130,9 +130,9 @@ class LineByLineTextDataset(Dataset):
        logger.info("Creating features from dataset file at %s", file_path)
        with open(file_path, encoding="utf-8") as f:
-            lines = [line for line in f.read().splitlines() if len(line) > 0]
+            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
-        self.examples = tokenizer.batch_encode_plus(lines, max_length=block_size)["input_ids"]
+        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
    def __len__(self):
        return len(self.examples)
@@ -704,10 +704,10 @@ def main():
        )
    if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence
+        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
-        args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
+        args.block_size = min(args.block_size, tokenizer.max_len)
    if args.model_name_or_path:
        model = model_class.from_pretrained(