diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index b368d86e0ed..430ec257426 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -234,8 +234,8 @@ class TrainingArguments:
             the last epoch before stopping training).
         max_steps (`int`, *optional*, defaults to -1):
             If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
-            In case of using a finite iterable dataset the training may stop before reaching the set number of steps
-            when all data is exhausted
+            For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+            `max_steps` is reached.
         lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`):
             The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values.
         lr_scheduler_kwargs ('dict', *optional*, defaults to {}):
@@ -2181,9 +2181,9 @@ class TrainingArguments:
                 Total number of training epochs to perform (if not an integer, will perform the decimal part percents
                 of the last epoch before stopping training).
             max_steps (`int`, *optional*, defaults to -1):
-                If set to a positive number, the total number of training steps to perform. Overrides
-                `num_train_epochs`. In case of using a finite iterable dataset the training may stop before reaching
-                the set number of steps when all data is exhausted.
+                If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
+                For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+                `max_steps` is reached.
             gradient_accumulation_steps (`int`, *optional*, defaults to 1):
                 Number of updates steps to accumulate the gradients for, before performing a backward/update pass.

@@ -2588,9 +2588,9 @@ class TrainingArguments:
                 Total number of training epochs to perform (if not an integer, will perform the decimal part percents
                 of the last epoch before stopping training).
             max_steps (`int`, *optional*, defaults to -1):
-                If set to a positive number, the total number of training steps to perform. Overrides
-                `num_train_epochs`. In case of using a finite iterable dataset the training may stop before reaching
-                the set number of steps when all data is exhausted.
+                If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
+                For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+                `max_steps` is reached.
             warmup_ratio (`float`, *optional*, defaults to 0.0):
                 Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
             warmup_steps (`int`, *optional*, defaults to 0):
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 461c4086acc..5a13cc551b6 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -92,6 +92,8 @@ class TFTrainingArguments(TrainingArguments):
             Total number of training epochs to perform.
         max_steps (`int`, *optional*, defaults to -1):
             If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
+            For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+            `max_steps` is reached.
         warmup_ratio (`float`, *optional*, defaults to 0.0):
             Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
         warmup_steps (`int`, *optional*, defaults to 0):
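
For illustration, here is a minimal sketch (not part of the patch) of the behavior the corrected docstring describes: when `max_steps` is positive it overrides `num_train_epochs`, and a finite dataset is cycled through repeatedly until that step count is reached. The output directory, dataset size, and batch size below are made-up values for the example.

```python
# Sketch: with max_steps set, the Trainer re-iterates over a finite dataset
# until max_steps optimizer steps have been performed, rather than stopping
# when the data is first exhausted.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",               # hypothetical output directory
    max_steps=500,                  # takes precedence over num_train_epochs
    num_train_epochs=3,             # ignored because max_steps > 0
    per_device_train_batch_size=8,
)

dataset_size = 1_000                # assumed finite dataset of 1,000 examples
steps_per_epoch = dataset_size // args.per_device_train_batch_size

print(steps_per_epoch)              # 125 steps per pass over the data
print(args.max_steps / steps_per_epoch)  # -> 4.0, i.e. ~4 passes (epochs)
```

Under these assumptions, one pass over the data yields only 125 steps, so training continues for roughly four passes to reach the requested 500 steps instead of ending after the first pass.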