Fix step bugs in no trainer examples (#24197)
Fix step counting in the no_trainer examples when resuming from a checkpoint with gradient accumulation enabled.
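The arithmetic being fixed hinges on two different units: a "step_N" checkpoint name counts optimizer updates, while the dataloader position is measured in batches, and gradient accumulation makes the two differ by a constant factor. A worked example with illustrative numbers (not taken from the commit):

    gradient_accumulation_steps = 4  # illustrative value
    checkpoint_updates = 50          # parsed from a directory named "step_50"

    # each optimizer update consumes gradient_accumulation_steps batches
    batches_consumed = checkpoint_updates * gradient_accumulation_steps  # 200 batches
    # converting back gives the update count the progress bar should report
    progress_updates = batches_consumed // gradient_accumulation_steps   # 50 updates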
commit f7d80cb3d2 (parent 08ae37c820)
@@ -453,10 +453,11 @@ def main():
             resume_step = None
             completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
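Taken together, the hunk above amounts to the following self-contained sketch (the function wrapper and the sample numbers are mine; the body mirrors the patched lines):

    def resume_position(training_difference, gradient_accumulation_steps,
                        batches_per_epoch, num_update_steps_per_epoch):
        """Mirror of the checkpoint-resume arithmetic in the no_trainer examples."""
        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
            completed_steps = starting_epoch * num_update_steps_per_epoch
        else:
            # "step_N" counts optimizer updates; multiply to get dataloader batches
            resume_step = int(training_difference.replace("step_", "")) * gradient_accumulation_steps
            starting_epoch = resume_step // batches_per_epoch
            resume_step -= starting_epoch * batches_per_epoch
            completed_steps = resume_step // gradient_accumulation_steps
        return starting_epoch, resume_step, completed_steps

    # e.g. resume_position("step_30", 4, 100, 25) -> (1, 20, 5)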
@@ -666,7 +666,7 @@ def main():
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
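For context, the `training_difference` string parsed in these hunks comes from the checkpoint directory name; a minimal sketch of that convention, assuming the `epoch_{n}`/`step_{n}` naming the example scripts use (the path is hypothetical):

    import os

    # Checkpoints in these scripts are saved as directories named either
    # "epoch_{epoch}" or "step_{completed_steps}", where completed_steps counts
    # optimizer updates, not dataloader batches.
    resume_from_checkpoint = "out/step_500"  # hypothetical path
    training_difference = os.path.splitext(os.path.basename(resume_from_checkpoint))[0]
    assert training_difference == "step_500"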
@@ -572,7 +572,7 @@ def main():
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
@@ -616,7 +616,7 @@ def main():
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
@@ -559,10 +559,11 @@ def main():
             resume_step = None
             completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
@@ -811,10 +811,11 @@ def main():
             resume_step = None
             completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
@@ -830,7 +830,7 @@ def main():
             resume_step = int(training_difference.replace("step_", ""))
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
@@ -556,10 +556,11 @@ def main():
             resume_step = None
             completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
@@ -628,10 +628,11 @@ def main():
             resume_step = None
             completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
@@ -501,10 +501,16 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(completed_steps)
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
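The `progress_bar.update(completed_steps)` lines these hunks add only make sense if the bar is denominated in optimizer updates; a minimal sketch, assuming a tqdm bar sized by total update steps as in the examples (the numbers are illustrative):

    from tqdm.auto import tqdm

    max_train_steps = 1000  # illustrative: total optimizer updates, not batches
    completed_steps = 250   # recovered from the checkpoint as in the hunk above

    # the bar is sized in update steps, so a resumed run must fast-forward it
    # by the updates already performed or it would restart at 0%
    progress_bar = tqdm(range(max_train_steps))
    progress_bar.update(completed_steps)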
@@ -659,10 +659,16 @@ def main():
         if "epoch" in training_difference:
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
+            completed_steps = starting_epoch * num_update_steps_per_epoch
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(completed_steps)
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
@@ -613,7 +613,7 @@ def main():
             resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
-            completed_steps = resume_step
+            completed_steps = resume_step // args.gradient_accumulation_steps
 
     # update the progress_bar if load from checkpoint
     progress_bar.update(completed_steps)
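Downstream of these hunks, `resume_step` (now measured in raw dataloader batches) tells the first resumed epoch how many batches to skip; the example scripts do this with `accelerator.skip_first_batches`. A plain-Python stand-in to show the contract (the function name is mine):

    from itertools import islice

    def resumed_batches(train_dataloader, epoch, starting_epoch, resume_step):
        """Skip already-consumed batches in the first resumed epoch.

        Plain-Python stand-in for the accelerator.skip_first_batches call in
        the example scripts; resume_step is in raw batches, which is why the
        hunks above multiply by gradient_accumulation_steps before deriving it.
        """
        if epoch == starting_epoch and resume_step is not None:
            return islice(iter(train_dataloader), resume_step, None)
        return iter(train_dataloader)

    # e.g. list(resumed_batches(range(10), epoch=1, starting_epoch=1, resume_step=7))
    # -> [7, 8, 9]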