fix: fsdp sharded state dict wont work for save_only_model knob (#36627)
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
parent 2a004f9ff1
commit 09a309d273
@@ -5175,6 +5175,12 @@ class Trainer:
             raise ValueError(
                 "`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDP"
             )
+        if (
+            self.args.save_only_model
+            and self.is_fsdp_enabled
+            and "SHARDED_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)
+        ):
+            raise ValueError("save_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'")

     def propagate_args_to_deepspeed(self, auto_find_batch_size=False):
         """