Clean up staging tmp checkpoint directory (#28848)

Clean up the remaining tmp (staging) checkpoint directory left on other nodes.

Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>
This commit is contained in:
Yunxuan Xiao 2024-02-12 07:47:21 -08:00 committed by GitHub
parent 136cd893dc
commit c617f988f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -2468,6 +2468,10 @@ class Trainer:
# Solely rely on numerical checkpoint id for rotation.
# mtime is not reliable especially on some fuse fs in cloud environments.
self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)
elif self.is_local_process_zero():
# Clean up the remaining staging checkpoint folders on other nodes
if staging_output_dir != output_dir and os.path.exists(staging_output_dir):
shutil.rmtree(staging_output_dir)
self.args.distributed_state.wait_for_everyone()