Allow training to resume even if RNG states are not properly loaded (#14994)

* Allow training to resume even if RNG states are not properly loaded

* Proper f-string
This commit is contained in:
Sylvain Gugger 2021-12-30 17:03:20 -05:00 committed by GitHub
parent 08cb5718ec
commit e68c3756fe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1553,7 +1553,13 @@ class Trainer:
if self.args.local_rank != -1:
torch.cuda.random.set_rng_state(checkpoint_rng_state["cuda"])
else:
torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
try:
torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
except Exception as e:
logger.info(
f"Didn't manage to set back the RNG states of the GPU because of the following error:\n {e}"
"\nThis won't yield the same results as if the training had not been interrupted."
)
if is_torch_tpu_available():
xm.set_rng_state(checkpoint_rng_state["xla"])