fixing error when using sharded ddp (#18435)

Sourab Mangrulkar 2022-08-03 08:39:58 +05:30 committed by GitHub
parent 5096a654b7
commit 22a0dd2ef7

@@ -1344,9 +1344,8 @@ class Trainer:
                     reshard_after_forward=zero_3,
                     cpu_offload=cpu_offload,
                 ).to(self.args.device)
-        # Distributed training using PyTorch FSDP
-        if self.fsdp is not None:
+        elif self.fsdp is not None:
             # PyTorch FSDP!
             from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
             from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
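
Review note: the hunk above chains the PyTorch FSDP check onto the sharded-DDP branch with `elif` instead of starting a fresh `if`. A minimal sketch of the control-flow issue follows; the flag names and wrapper strings are made up for illustration and are not the real `Trainer._wrap_model` attributes. With a standalone `if`, a model that fairscale's FullyShardedDDP has already wrapped can still reach a later `elif` and get wrapped a second time in DistributedDataParallel, which is presumably the "error when using sharded ddp" the title refers to.

# Minimal sketch of the if/elif fix. Flag names and wrapper strings are
# illustrative only, not the actual Trainer logic.

def wrap_buggy(model, use_sharded_ddp, use_fsdp, is_distributed):
    if use_sharded_ddp:
        model = f"FullyShardedDDP({model})"  # sharded DDP wraps the model
    if use_fsdp:  # standalone `if` starts a new chain
        model = f"FSDP({model})"
    elif is_distributed:
        # Reached even though sharded DDP already wrapped the model above.
        model = f"DistributedDataParallel({model})"
    return model


def wrap_fixed(model, use_sharded_ddp, use_fsdp, is_distributed):
    if use_sharded_ddp:
        model = f"FullyShardedDDP({model})"
    elif use_fsdp:  # chained: skipped once the sharded DDP branch ran
        model = f"FSDP({model})"
    elif is_distributed:
        model = f"DistributedDataParallel({model})"
    return model


print(wrap_buggy("model", True, False, True))
# -> DistributedDataParallel(FullyShardedDDP(model))  (double wrapping)
print(wrap_fixed("model", True, False, True))
# -> FullyShardedDDP(model)

The second hunk below shows the downstream `elif is_sagemaker_dp_enabled()` branch that now sits in the same chain.
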
@@ -1394,7 +1393,6 @@
             )
             if FSDPOption.OFFLOAD not in self.args.fsdp:
                 model.to(self.args.device)
         elif is_sagemaker_dp_enabled():
             model = nn.parallel.DistributedDataParallel(
                 model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))]