From cd918492c694bcf4fe8f5ca403f00d1d40ae46ac Mon Sep 17 00:00:00 2001
From: radcheb
Date: Tue, 3 Jan 2023 21:16:12 +0100
Subject: [PATCH] Fix race condition on cleaning checkpoints when
 save_total_limit set to 1 (#20989)

* Update trainer.py

* fix style

Co-authored-by: Radhwane Chebaane
---
 src/transformers/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 56b621c8a7f..350af041866 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1919,8 +1919,8 @@ class Trainer:
         run_dir = self._get_output_dir(trial)
         checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)
 
-        # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint.
-        if self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
+        # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
+        if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
             for checkpoint in checkpoints_sorted:
                 if checkpoint != self.state.best_model_checkpoint:
                     logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
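
The fix gates the checkpoint cleanup behind `args.should_save`, so in a multi-process run only the process that is permitted to write checkpoints deletes the superseded one; before this change, every rank could race to remove the same directory. Below is a minimal, self-contained sketch of the guarded logic, for illustration only: the helper name `delete_stale_checkpoints` is hypothetical, and the deletion step is assumed to be a `shutil.rmtree` following the log message shown in the diff.

```python
import shutil


def delete_stale_checkpoints(args, state, checkpoints_sorted):
    """Illustrative only: mirrors the guarded cleanup from the diff above."""
    # Without the `args.should_save` guard, every process in a distributed run
    # would attempt to delete the same stale checkpoint directory, and the
    # losers of that race would fail on an already-removed path.
    if args.should_save and state.best_model_checkpoint is not None and args.save_total_limit == 1:
        for checkpoint in checkpoints_sorted:
            if checkpoint != state.best_model_checkpoint:
                # Assumed deletion step; the real Trainer logs and then removes the directory.
                shutil.rmtree(checkpoint, ignore_errors=True)
```

Reusing `should_save` keeps the "which process writes to disk" decision in `TrainingArguments`, which already accounts for the process rank and the `save_on_each_node` setting, rather than re-deriving it in the cleanup path.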