[deepspeed] offload + non-cpuadam optimizer exception (#22043)

* [deepspeed] offload + non-cpuadam optimizer exception

* flip

* revert min version
commit ec24132b6c (parent d0c19b3303)
Author: Stas Bekman (committed by GitHub)
Date:   2023-03-09 08:12:57 -08:00
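This PR's test changes opt out of DeepSpeed's CPU-optimizer enforcement: with ZeRO optimizer offload plus a client-provided (non-CPUAdam) optimizer such as the HF Trainer's default torch AdamW, DeepSpeed raises at init unless `zero_force_ds_cpu_optimizer` is disabled. A minimal sketch of the resulting config, assuming a ZeRO-2 setup; the surrounding keys are illustrative, only the last key comes from this diff:

```python
# Minimal sketch (illustrative ZeRO-2 config; only the last key is from this diff).
ds_config_dict = {
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},  # force cpu offload, as in the test
    },
    # offload is not efficient w/o CPUAdam, but it must not hard-fail either
    "zero_force_ds_cpu_optimizer": False,
}
```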


@@ -426,6 +426,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
         # force cpu offload
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
+        ds_config_dict["zero_force_ds_cpu_optimizer"] = False  # offload is not efficient w/o CPUAdam
         with mockenv_context(**self.dist_env_1_gpu):
             kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
             kwargs[dtype] = True
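For context, the kwargs assembled in the hunk above feed straight into TrainingArguments; a standalone sketch of the equivalent call (output_dir is a placeholder, and fp16 stands in for the test's parametrized `kwargs[dtype] = True`):

```python
# Standalone sketch of what the test's kwargs amount to (values illustrative).
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/tmp/ds_test",  # placeholder path
    local_rank=0,
    deepspeed=ds_config_dict,   # a live dict works here as well as a json path
    fp16=True,                  # the test sets kwargs[dtype] = True for fp16/bf16
)
```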
@@ -776,6 +777,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         ds_config_dict = self.get_config_dict(stage)
         del ds_config_dict["optimizer"]  # will use HF Trainer optimizer
         del ds_config_dict["scheduler"]  # will use HF Trainer scheduler
+        ds_config_dict["zero_force_ds_cpu_optimizer"] = False  # offload is not efficient w/o CPUAdam
         # must use this setting to get the reload path exercised
         ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True