Fix issue with ratio evaluation steps and auto find batch size (#25436)

* Fully rebased solution

* 500
commit a7da2996a0 (parent 2d6839eaa6)
Author: Zach Mueller
Date: 2023-08-10 11:07:32 -04:00 (committed by GitHub)
3 changed files with 31 additions and 13 deletions
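
Context for the change: `logging_steps`, `eval_steps`, and `save_steps` accept a float in [0, 1) as a ratio of `max_steps`. The old code resolved those ratios by overwriting the values on `args` inside `_inner_training_loop`; with `auto_find_batch_size`, that loop can re-run with a smaller batch size and a different `max_steps`, and the already-converted integer was reused. The fix stores the resolved values on `TrainerState`, which is recreated on every pass. A minimal usage sketch of the ratio arguments (the `TrainingArguments` fields shown are real; `output_dir` and the values are illustrative):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    max_steps=1_000,
    evaluation_strategy="steps",
    logging_steps=0.1,   # ratio: log every ceil(1_000 * 0.1) = 100 steps
    eval_steps=0.25,     # ratio: evaluate every 250 steps
    save_steps=500,      # plain integer: save every 500 steps
    auto_find_batch_size=True,  # retries the inner loop on CUDA OOM
)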


@@ -746,7 +746,7 @@ class WandbCallback(TrainerCallback):
         # keep track of model topology and gradients, unsupported on TPU
         _watch_model = os.getenv("WANDB_WATCH", "false")
         if not is_torch_tpu_available() and _watch_model in ("all", "parameters", "gradients"):
-            self._wandb.watch(model, log=_watch_model, log_freq=max(100, args.logging_steps))
+            self._wandb.watch(model, log=_watch_model, log_freq=max(100, state.logging_steps))

     def on_train_begin(self, args, state, control, model=None, **kwargs):
         if self._wandb is None:
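
Why the callback now reads `state.logging_steps`: the ratio-to-integer conversion no longer mutates `args`, so `args.logging_steps` can still be a float ratio by the time `setup` runs, while the resolved integer lives on the state. A small demo with made-up numbers:

import math

max_steps = 10_000
args_logging_steps = 0.05                          # user-supplied ratio, left on args
state_logging_steps = math.ceil(max_steps * 0.05)  # 500, resolved onto TrainerState

print(max(100, args_logging_steps))   # 100 -- a raw ratio would give the wrong log_freq
print(max(100, state_logging_steps))  # 500 -- matches the resolved interval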


@@ -1586,14 +1586,6 @@ class Trainer:
                 f" {args.max_steps}"
             )

-        # Compute absolute values for logging, eval, and save if given as ratio
-        if args.logging_steps and args.logging_steps < 1:
-            args.logging_steps = math.ceil(max_steps * args.logging_steps)
-        if args.eval_steps and args.eval_steps < 1:
-            args.eval_steps = math.ceil(max_steps * args.eval_steps)
-        if args.save_steps and args.save_steps < 1:
-            args.save_steps = math.ceil(max_steps * args.save_steps)
-
         if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
             if self.args.n_gpu > 1:
                 # nn.DataParallel(model) replicates the model, creating new variables and module
@@ -1627,6 +1619,23 @@
         self.state = TrainerState()
         self.state.is_hyper_param_search = trial is not None

+        # Compute absolute values for logging, eval, and save if given as ratio
+        if args.logging_steps is not None:
+            if args.logging_steps < 1:
+                self.state.logging_steps = math.ceil(max_steps * args.logging_steps)
+            else:
+                self.state.logging_steps = args.logging_steps
+        if args.eval_steps is not None:
+            if args.eval_steps < 1:
+                self.state.eval_steps = math.ceil(max_steps * args.eval_steps)
+            else:
+                self.state.eval_steps = args.eval_steps
+        if args.save_steps is not None:
+            if args.save_steps < 1:
+                self.state.save_steps = math.ceil(max_steps * args.save_steps)
+            else:
+                self.state.save_steps = args.save_steps
+
         # Activate gradient checkpointing if needed
         if args.gradient_checkpointing:
             self.model.gradient_checkpointing_enable()
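
The resolution rule above, pulled out as a standalone sketch (`resolve_steps` is an illustrative helper, not a transformers function): values below 1 are treated as a ratio of `max_steps` and rounded up; anything else passes through as an absolute count.

import math

def resolve_steps(value, max_steps):
    # None means the interval is unset; a float in (0, 1) is a ratio; ints pass through.
    if value is None:
        return None
    if value < 1:
        return math.ceil(max_steps * value)
    return int(value)

assert resolve_steps(0.1, 500) == 50
assert resolve_steps(500, 10_000) == 500
assert resolve_steps(None, 10_000) is None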


@@ -53,6 +53,12 @@ class TrainerState:
             During training, represents the number of update steps completed.
         max_steps (`int`, *optional*, defaults to 0):
             The number of update steps to do during the current training.
+        logging_steps (`int`, *optional*, defaults to 500):
+            Log every X update steps.
+        eval_steps (`int`, *optional*):
+            Run an evaluation every X steps.
+        save_steps (`int`, *optional*, defaults to 500):
+            Save checkpoint every X update steps.
         total_flos (`float`, *optional*, defaults to 0):
             The total number of floating operations done by the model since the beginning of training (stored as floats
             to avoid overflow).
@@ -77,6 +83,9 @@
     epoch: Optional[float] = None
     global_step: int = 0
     max_steps: int = 0
+    logging_steps: int = 500
+    eval_steps: int = 500
+    save_steps: int = 500
     num_train_epochs: int = 0
     total_flos: float = 0
     log_history: List[Dict[str, float]] = None
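
Sanity check of the new defaults: a fresh `TrainerState` carries 500 for all three intervals until the `Trainer` overwrites them from `args` once `max_steps` is known. (`TrainerState` is importable from transformers; the print is illustrative.)

from transformers import TrainerState

state = TrainerState()
print(state.logging_steps, state.eval_steps, state.save_steps)  # 500 500 500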
@@ -421,13 +430,13 @@ class DefaultFlowCallback(TrainerCallback):
         # Log
         if state.global_step == 1 and args.logging_first_step:
             control.should_log = True
-        if args.logging_strategy == IntervalStrategy.STEPS and state.global_step % args.logging_steps == 0:
+        if args.logging_strategy == IntervalStrategy.STEPS and state.global_step % state.logging_steps == 0:
             control.should_log = True

         # Evaluate
         if (
             args.evaluation_strategy == IntervalStrategy.STEPS
-            and state.global_step % args.eval_steps == 0
+            and state.global_step % state.eval_steps == 0
             and args.eval_delay <= state.global_step
         ):
             control.should_evaluate = True
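
The firing condition is a plain modulo on the resolved interval; a sketch (illustrative helper, not part of transformers):

def fires(global_step: int, interval: int) -> bool:
    # An event fires when global_step is a nonzero multiple of the interval.
    return interval > 0 and global_step % interval == 0

assert fires(500, 500) and fires(1_000, 500)
assert not fires(750, 500)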
@@ -435,8 +444,8 @@ class DefaultFlowCallback(TrainerCallback):
         # Save
         if (
             args.save_strategy == IntervalStrategy.STEPS
-            and args.save_steps > 0
-            and state.global_step % args.save_steps == 0
+            and state.save_steps > 0
+            and state.global_step % state.save_steps == 0
         ):
             control.should_save = True
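
Why the values live on `TrainerState` rather than `args`: with `auto_find_batch_size`, `_inner_training_loop` can run again after an OOM with a smaller batch size and therefore a different `max_steps`, and the state is rebuilt on each pass. A sketch of the failure mode this fixes, with made-up numbers:

import math

ratio = 0.1
first_max_steps, retry_max_steps = 1_000, 2_000  # retry halves the batch size

# Old behavior: the ratio on args was overwritten in place on the first pass...
args_eval_steps = math.ceil(first_max_steps * ratio)   # 100
# ...so the retry saw 100 (no longer < 1) and kept the stale interval.

# New behavior: the ratio stays on args and is re-resolved onto a fresh state.
state_eval_steps = math.ceil(retry_max_steps * ratio)  # 200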