diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 309e02468cc..223b8508fbf 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -338,6 +338,12 @@ def main(): if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length + if max_seq_length > 1024: + logger.warn( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + ) + max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warn(