Fix race condition on cleaning checkpoints when save_total_limit is set to 1 (#20989)

* Update trainer.py

* fix style

Co-authored-by: Radhwane Chebaane <rchebaane.external@epo.org>
This commit is contained in:
radcheb 2023-01-03 21:16:12 +01:00 committed by GitHub
parent cd2457809f
commit cd918492c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -1919,8 +1919,8 @@ class Trainer:
run_dir = self._get_output_dir(trial)
checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
- # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint.
- if self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
+ # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
+ if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
for checkpoint in checkpoints_sorted:
if checkpoint != self.state.best_model_checkpoint:
logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")