[DeepSpeed ZeRO3] Fix performance degradation in sharded models (#18911)
* [DeepSpeed] Fix performance degradation in sharded models
* style
* polish

Co-authored-by: Stas Bekman <stas@stason.org>
parent 10c774cf60
commit 3059d80d80
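For background, the touched code relies on DeepSpeed ZeRO-3's gather/modify/re-partition pattern: parameters live as partitioned placeholders on every rank, and deepspeed.zero.GatheredParameters temporarily reassembles them so rank 0 can copy weights in before they are re-partitioned. Below is a minimal sketch of that pattern, assuming DeepSpeed is installed and the script runs under a distributed launcher; the layer and the state dict are illustrative, not taken from this commit.

import deepspeed
import torch
from torch import nn

# Sketch only: must be started with a distributed launcher (e.g. `deepspeed script.py`).
deepspeed.init_distributed()

# Under zero.Init(), parameters are partitioned at construction time, so each rank
# holds only a shard (placeholder) of every weight.
with deepspeed.zero.Init():
    layer = nn.Linear(8, 8)

# Illustrative full weights to load (in real use these come from a checkpoint).
full_state = {"weight": torch.zeros(8, 8), "bias": torch.zeros(8)}

# GatheredParameters temporarily unpartitions the listed parameters; with
# modifier_rank=0, the modification made on rank 0 is propagated to all ranks
# when the block exits and the parameters are re-partitioned.
with deepspeed.zero.GatheredParameters(list(layer.parameters()), modifier_rank=0):
    if torch.distributed.get_rank() == 0:
        layer.load_state_dict(full_state)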
@@ -421,12 +421,17 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
         if is_deepspeed_zero3_enabled():
             import deepspeed
 
-            # because zero3 puts placeholders in model params, this context
-            # manager gathers (unpartitions) the params of the current layer, then loads from
-            # the state dict and then re-partitions them again
-            with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0):
-                if torch.distributed.get_rank() == 0:
-                    module._load_from_state_dict(*args)
+            # In sharded models, each shard has only part of the full state_dict, so only gather
+            # parameters that are in the current state_dict.
+            named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
+            params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
+            if len(params_to_gather) > 0:
+                # because zero3 puts placeholders in model params, this context
+                # manager gathers (unpartitions) the params of the current layer, then loads from
+                # the state dict and then re-partitions them again
+                with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
+                    if torch.distributed.get_rank() == 0:
+                        module._load_from_state_dict(*args)
         else:
             module._load_from_state_dict(*args)
 
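The performance fix is the params_to_gather filter: with a sharded checkpoint, each shard's state_dict covers only some of the modules, but the old code gathered every module's parameters on every shard, paying for cross-rank communication even when there was nothing to load from that shard. A minimal sketch of the same selection logic in plain PyTorch follows (no DeepSpeed needed to see the effect; the helper name, the two-layer model, and the shard split are made up for illustration).

from torch import nn

def params_in_shard(module: nn.Module, state_dict, prefix=""):
    # Same selection as the patch: fully qualified names of this module's own
    # parameters (recurse=False), kept only if they appear in the (partial) state_dict.
    named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
    return [named_parameters[k] for k in state_dict.keys() if k in named_parameters]

# Hypothetical model; the "shard" holds only the first layer's weights.
model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
shard = {k: v for k, v in model.state_dict().items() if k.startswith("0.")}

print(len(params_in_shard(model[0], shard, prefix="0.")))  # 2 -> gather and load
print(len(params_in_shard(model[1], shard, prefix="1.")))  # 0 -> nothing to load

Under ZeRO-3, only the non-empty case enters deepspeed.zero.GatheredParameters, so modules whose weights live in another shard no longer trigger the collective at all.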