mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-31 02:02:21 +06:00
fix: use mtime by default in Trainer._rotate_checkpoints with automatic fallback (#37260)
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
parent
6daec12d0b
commit
dac443414e
@ -3224,9 +3224,8 @@ class Trainer:
|
||||
|
||||
# Maybe delete some older checkpoints.
|
||||
if self.args.should_save:
|
||||
# Solely rely on numerical checkpoint id for rotation.
|
||||
# mtime is not reliable especially on some fuse fs in cloud environments.
|
||||
self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)
|
||||
# Use mtime by default; filesystems with unreliable mtime are detected in `_sorted_checkpoints`, which falls back to numerical ordering.
|
||||
self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
|
||||
|
||||
def _save_rng_state(self, output_dir):
|
||||
# Save RNG state in non-distributed training
|
||||
@ -4039,7 +4038,17 @@ class Trainer:
|
||||
ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
|
||||
|
||||
checkpoints_sorted = sorted(ordering_and_checkpoint_path)
|
||||
# mtime is not reliable on all filesystems, especially on some fuse fs in cloud environments
|
||||
# so we check whether the mtimes look fake and fall back to numerical ordering if needed
|
||||
if use_mtime and len(ordering_and_checkpoint_path) > 1:
|
||||
mtime_diff = checkpoints_sorted[-1][0] - checkpoints_sorted[0][0]
|
||||
if mtime_diff < 1.0: # less than 1 second, which is almost impossible when mtime works fine
|
||||
warnings.warn("mtime may not be reliable on this filesystem, falling back to numerical ordering")
|
||||
return self._sorted_checkpoints(
|
||||
use_mtime=False, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
|
||||
)
|
||||
checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
|
||||
|
||||
# Make sure we don't delete the best model.
|
||||
if (
|
||||
self.state.best_model_checkpoint is not None
|
||||
|
Loading…
Reference in New Issue
Block a user