mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-23 14:29:01 +06:00
Fix race condition on cleaning checkpoints when save_total_limit set to 1 (#20989)
* Update trainer.py
* fix style

Co-authored-by: Radhwane Chebaane <rchebaane.external@epo.org>
This commit is contained in:
parent
cd2457809f
commit
cd918492c6
@@ -1919,8 +1919,8 @@ class Trainer:
|
|||||||
run_dir = self._get_output_dir(trial)
|
run_dir = self._get_output_dir(trial)
|
||||||
checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
|
checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
|
||||||
|
|
||||||
# Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint.
|
# Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
|
||||||
if self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
|
if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
|
||||||
for checkpoint in checkpoints_sorted:
|
for checkpoint in checkpoints_sorted:
|
||||||
if checkpoint != self.state.best_model_checkpoint:
|
if checkpoint != self.state.best_model_checkpoint:
|
||||||
logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
|
logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
|
||||||
|
Loading…
Reference in New Issue
Block a user