diff --git a/examples/run_language_modeling.py b/examples/run_language_modeling.py
index dc68a67ce80..e0a89299278 100644
--- a/examples/run_language_modeling.py
+++ b/examples/run_language_modeling.py
@@ -86,6 +86,9 @@ MODEL_CLASSES = {
 class TextDataset(Dataset):
     def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
         assert os.path.isfile(file_path)
+
+        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
+
         directory, filename = os.path.split(file_path)
         cached_features_file = os.path.join(
             directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
@@ -195,6 +198,12 @@ def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -
 
 def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
     """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
+
+    if tokenizer.mask_token is None:
+        raise ValueError(
+            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
+        )
+
     labels = inputs.clone()
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
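
The first hunk shrinks `block_size` by the number of special tokens the tokenizer will add around each block, so that once `TextDataset` wraps a block with `build_inputs_with_special_tokens` the example does not overflow the model's maximum length. A minimal sketch of that arithmetic, assuming the transformers v2-era tokenizer API used by this script (`max_len` / `max_len_single_sentence`) and using `bert-base-uncased` purely as an illustrative checkpoint:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Number of special tokens added around a single sequence:
# for BERT this is 2 ([CLS] and [SEP]), i.e. 512 - 510.
num_special = tokenizer.max_len - tokenizer.max_len_single_sentence

# Reserve room for those tokens up front, as the hunk above does.
block_size = 512 - num_special  # 510 content tokens per block

# Wrapping a block with special tokens brings it back to exactly the
# model's limit instead of exceeding it (dummy ids used for brevity).
ids = tokenizer.build_inputs_with_special_tokens(list(range(block_size)))
assert len(ids) == tokenizer.max_len  # 512
```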
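
The second hunk makes `mask_tokens` fail fast with an actionable message when the tokenizer defines no mask token, instead of breaking later with a less obvious error during token lookup. A quick sketch of the case the guard catches, with `gpt2` as an illustrative mask-token-less checkpoint:

```python
from transformers import BertTokenizer, GPT2Tokenizer

# BERT-style tokenizers define a mask token, so --mlm works with them.
print(BertTokenizer.from_pretrained("bert-base-uncased").mask_token)  # [MASK]

# GPT-2's tokenizer has none; mask_token is None, so with this change
# mask_tokens() raises a clear ValueError telling the user to drop --mlm.
print(GPT2Tokenizer.from_pretrained("gpt2").mask_token)  # None
```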