fix: fsdp sharded state dict won't work for save_only_model knob (#36627)

Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
Mehant Kammakomati 2025-03-13 21:47:35 +05:30 committed by GitHub
parent 2a004f9ff1
commit 09a309d273
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -5175,6 +5175,12 @@ class Trainer:
raise ValueError(
"`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDP"
)
if (
self.args.save_only_model
and self.is_fsdp_enabled
and "SHARDED_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)
):
raise ValueError("save_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'")
def propagate_args_to_deepspeed(self, auto_find_batch_size=False):
"""