Restrain tokenizer.model_max_length default (#9681)

* Restrain tokenizer.model_max_length default

* Fix indent
This commit is contained in:
Sylvain Gugger 2021-01-20 04:17:39 -05:00 committed by GitHub
parent 7e662e6a3b
commit a1ad16a446
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -338,6 +338,12 @@ def main():
if data_args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024:
logger.warn(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
)
max_seq_length = 1024
else:
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn(