fix: use mtime by default in Trainer._rotate_checkpoints with automatic fallback (#37260)

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
Terrasse 2025-04-10 23:42:06 +08:00 committed by GitHub
parent 6daec12d0b
commit dac443414e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -3224,9 +3224,8 @@ class Trainer:
# Maybe delete some older checkpoints.
if self.args.should_save:
# Solely rely on numerical checkpoint id for rotation.
# mtime is not reliable especially on some fuse fs in cloud environments.
self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)
# We use mtime by default; filesystems without reliable mtime support are detected in `_sorted_checkpoints`.
self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
def _save_rng_state(self, output_dir):
# Save RNG state in non-distributed training
@ -4039,7 +4038,17 @@ class Trainer:
ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
checkpoints_sorted = sorted(ordering_and_checkpoint_path)
# mtime is not reliable on all filesystems, especially on some FUSE filesystems in cloud environments,
# so we check whether the mtime looks fake and fall back to numerical ordering if needed.
if use_mtime and len(ordering_and_checkpoint_path) > 1:
mtime_diff = checkpoints_sorted[-1][0] - checkpoints_sorted[0][0]
if mtime_diff < 1.0: # less than 1 second, which is almost impossible when mtime works fine
warnings.warn("mtime may not be reliable on this filesystem, falling back to numerical ordering")
return self._sorted_checkpoints(
use_mtime=False, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
)
checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
# Make sure we don't delete the best model.
if (
self.state.best_model_checkpoint is not None