Mirror of https://github.com/huggingface/transformers.git
(synced 2025-07-03 12:50:06 +06:00)
🧼 remove v4.44 deprecations (#34245)
* remove v4.44 deprecations
* PR comments
* deprecations scheduled for v4.50
* hub version update
* make fixup

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
parent 8d50fda644
commit 13493215ab
setup.py (2 lines changed)
@@ -117,7 +117,7 @@ _deps = [
     "fugashi>=1.0",
     "GitPython<3.1.19",
     "hf-doc-builder>=0.3.0",
-    "huggingface-hub>=0.23.2,<1.0",
+    "huggingface-hub>=0.24.0,<1.0",
     "importlib_metadata",
     "ipadic>=1.0.0,<2.0",
     "isort>=5.5.4",

src/transformers/dependency_versions_table.py
@@ -24,7 +24,7 @@ deps = {
     "fugashi": "fugashi>=1.0",
     "GitPython": "GitPython<3.1.19",
     "hf-doc-builder": "hf-doc-builder>=0.3.0",
-    "huggingface-hub": "huggingface-hub>=0.23.2,<1.0",
+    "huggingface-hub": "huggingface-hub>=0.24.0,<1.0",
     "importlib_metadata": "importlib_metadata",
     "ipadic": "ipadic>=1.0.0,<2.0",
     "isort": "isort>=5.5.4",

src/transformers/modeling_utils.py
@@ -94,7 +94,7 @@ from .utils import (
     replace_return_docstrings,
     strtobool,
 )
-from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files
+from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files
 from .utils.import_utils import (
     ENV_VARS_TRUE_VALUES,
     is_sagemaker_mp_enabled,
@@ -381,92 +381,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
     return False
 
 
-def shard_checkpoint(
-    state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
-):
-    """
-    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
-    given size.
-
-    The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
-    optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
-    limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
-    [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].
-
-    <Tip warning={true}>
-
-    If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
-    have a size greater than `max_shard_size`.
-
-    </Tip>
-
-    Args:
-        state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
-        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
-            The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
-            (like `"5MB"`).
-        weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
-            The name of the model save file.
-    """
-    logger.warning(
-        "Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using "
-        "split_torch_state_dict_into_shards from huggingface_hub library"
-    )
-    max_shard_size = convert_file_size_to_int(max_shard_size)
-
-    sharded_state_dicts = [{}]
-    last_block_size = 0
-    total_size = 0
-    storage_id_to_block = {}
-
-    for key, weight in state_dict.items():
-        # when bnb serialization is used the weights in the state dict can be strings
-        # check: https://github.com/huggingface/transformers/pull/24416 for more details
-        if isinstance(weight, str):
-            continue
-        else:
-            storage_id = id_tensor_storage(weight)
-
-        # If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block`
-        if storage_id in storage_id_to_block and weight.device != torch.device("meta"):
-            block_id = storage_id_to_block[storage_id]
-            sharded_state_dicts[block_id][key] = weight
-            continue
-
-        weight_size = weight.numel() * dtype_byte_size(weight.dtype)
-        # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one
-        # weight in the current shard.
-        if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
-            sharded_state_dicts.append({})
-            last_block_size = 0
-
-        sharded_state_dicts[-1][key] = weight
-        last_block_size += weight_size
-        total_size += weight_size
-        storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1
-
-    # If we only have one shard, we return it
-    if len(sharded_state_dicts) == 1:
-        return {weights_name: sharded_state_dicts[0]}, None
-
-    # Otherwise, let's build the index
-    weight_map = {}
-    shards = {}
-    for idx, shard in enumerate(sharded_state_dicts):
-        shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
-        shard_file = shard_file.replace(
-            ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
-        )
-        shards[shard_file] = shard
-        for key in shard.keys():
-            weight_map[key] = shard_file
-
-    # Add the metadata
-    metadata = {"total_size": total_size}
-    index = {"metadata": metadata, "weight_map": weight_map}
-    return shards, index
-
-
 def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
     """
     This is the same as
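
Note: the removed `shard_checkpoint` itself pointed users to `split_torch_state_dict_into_shards` from the huggingface_hub library, which is why the dependency pin above moves to huggingface-hub>=0.24.0. A minimal migration sketch follows, assuming only the split-object attributes this commit itself uses (`filename_to_tensors`, `is_sharded`, `metadata`, `tensor_to_filename`); the helper name `save_sharded` is hypothetical:

    import os
    import torch
    from huggingface_hub import split_torch_state_dict_into_shards

    def save_sharded(state_dict, save_dir, max_shard_size="10GB"):
        # Greedy split by key order, like the removed helper; shared-storage
        # handling is left to the hub implementation.
        split = split_torch_state_dict_into_shards(state_dict, max_shard_size=max_shard_size)
        for filename, tensor_names in split.filename_to_tensors.items():
            shard = {name: state_dict[name] for name in tensor_names}
            torch.save(shard, os.path.join(save_dir, filename))
        # An index only makes sense when there is more than one shard.
        if split.is_sharded:
            return {"metadata": split.metadata, "weight_map": split.tensor_to_filename}
        return None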

src/transformers/models/blip_2/modeling_blip_2.py
@@ -2203,7 +2203,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
             logger.warning_once(
                 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(
@@ -2326,7 +2326,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
             logger.warning_once(
                 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(

src/transformers/models/blip_2/processing_blip_2.py
@@ -153,7 +153,7 @@ class Blip2Processor(ProcessorMixin):
             logger.warning_once(
                 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
 
         # cast to desired return tensors type

src/transformers/models/instructblip/modeling_instructblip.py
@@ -1471,7 +1471,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
             logger.warning_once(
                 "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(
@@ -1610,7 +1610,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
             logger.warning_once(
                 "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(

src/transformers/models/instructblip/processing_instructblip.py
@@ -148,7 +148,7 @@ class InstructBlipProcessor(ProcessorMixin):
             logger.warning_once(
                 "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
 
         # cast to desired return tensors type after concatenating

src/transformers/models/llava/modeling_llava.py
@@ -485,7 +485,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
                 "Expanding inputs for image tokens in LLaVa should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             # prefill stage vs decoding stage (legacy behavior copied)
             if input_ids.shape[1] != 1:

src/transformers/models/llava/processing_llava.py
@@ -160,7 +160,7 @@ class LlavaProcessor(ProcessorMixin):
                 "Expanding inputs for image tokens in LLaVa should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
 
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
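
The warning in the two LLaVa hunks above spells out the fix on the user side. A short sketch, assuming the llava-hf/llava-1.5-7b-hf checkpoint (its CLIP ViT-L/14-336 vision tower implies patch_size=14 and the "default" select strategy; substitute your own checkpoint's values):

    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
    # With these set, the processor expands image tokens itself, so the
    # deprecated in-model expansion path (and its warning) is never hit.
    processor.patch_size = 14
    processor.vision_feature_select_strategy = "default"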

src/transformers/models/llava_next/modeling_llava_next.py
@@ -868,7 +868,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
                 "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             if input_ids.shape[1] != 1:
                 inputs_embeds = inputs_embeds.to(image_features.dtype)

src/transformers/models/llava_next/processing_llava_next.py
@@ -143,7 +143,7 @@ class LlavaNextProcessor(ProcessorMixin):
                 "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
         else:
             image_sizes = iter(image_inputs["image_sizes"])

src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py
@@ -21,10 +21,11 @@ import sys
 import types
 
 import torch
+from huggingface_hub import split_torch_state_dict_into_shards
 from packaging import version
 
 from transformers import AutoTokenizer, GPT2Config
-from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint
+from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME
 
 
 def add_checkpointing_args(parser):
@@ -571,7 +572,15 @@ def convert_checkpoint_from_megatron_to_transformers(args):
 
     # Store the state_dict to file.
     max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
-    shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size)
+    state_dict_split = split_torch_state_dict_into_shards(output_state_dict, max_shard_size=max_shard_size)
+    shards = index = None
+    for tensors in state_dict_split.filename_to_tensors.values():
+        shards = {tensor: state_dict[tensor] for tensor in tensors}
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
 
     # Save the model
     for shard_file, shard in shards.items():
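
Two apparent slips in the replacement block above (the same pattern recurs in the RWKV converter below): the loop rebinds `shards` on every iteration, so only the last shard would reach the save loop, and in this file it reads `state_dict` while the converted tensors live in `output_state_dict`. A hedged sketch of the filename-to-shard mapping the `for shard_file, shard in shards.items()` save loop appears to expect:

    # Sketch, not the committed code: one {tensor_name: tensor} dict per shard
    # file, keyed by filename, so the save loop writes every shard rather than
    # only the last one.
    shards = {
        filename: {name: output_state_dict[name] for name in tensor_names}
        for filename, tensor_names in state_dict_split.filename_to_tensors.items()
    }
    index = None
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }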

src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py
@@ -21,10 +21,10 @@ import os
 import re
 
 import torch
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, split_torch_state_dict_into_shards
 
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig
-from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint
+from transformers.modeling_utils import WEIGHTS_INDEX_NAME
 
 
 NUM_HIDDEN_LAYERS_MAPPING = {
@@ -116,7 +116,16 @@ def convert_rmkv_checkpoint_to_hf_format(
     state_dict = convert_state_dict(state_dict)
 
     # 4. Split in shards and save
-    shards, index = shard_checkpoint(state_dict)
+    state_dict_split = split_torch_state_dict_into_shards(state_dict)
+    shards = index = None
+    for tensors in state_dict_split.filename_to_tensors.values():
+        shards = {tensor: state_dict[tensor] for tensor in tensors}
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+
     for shard_file, shard in shards.items():
         torch.save(shard, os.path.join(output_dir, shard_file))
 

src/transformers/models/video_llava/modeling_video_llava.py
@@ -578,7 +578,7 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi
                 "Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             if input_ids.shape[1] != 1:
                 for features, frames in ((image_features, 1), (video_features, num_frames)):

src/transformers/models/video_llava/processing_video_llava.py
@@ -149,9 +149,10 @@ class VideoLlavaProcessor(ProcessorMixin):
         if encoded_images is not None and (self.patch_size is None or self.vision_feature_select_strategy is None):
             logger.warning_once(
                 "Expanding inputs for image tokens in Video-LLaVa should be done in processing. "
-                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
-                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
+                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set "
+                "directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = "
+                "{{vision_feature_select_strategy}}`. Using processors without these attributes in the config is "
+                "deprecated and will throw an error in v4.50."
             )
             # Replace the image/video tokens with the expanded token sequence
         elif encoded_images is not None:

src/transformers/models/vipllava/modeling_vipllava.py
@@ -476,7 +476,7 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin)
             logger.warning_once(
                 "Expanding inputs for image tokens in VipLLaVa should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             # prefill stage vs decoding stage (legacy behavior copied)
             if input_ids.shape[1] != 1:

tests/test_modeling_utils.py
@@ -105,7 +105,6 @@ if is_torch_available():
         _find_disjoint,
         _find_identical,
         dtype_byte_size,
-        shard_checkpoint,
     )
     from transformers.pytorch_utils import isin_mps_friendly
 
@@ -668,71 +667,6 @@ class ModelUtilsTest(TestCasePlus):
         for p1, p2 in zip(model.parameters(), new_model.parameters()):
             self.assertTrue(torch.equal(p1, p2))
 
-    def test_shard_checkpoint(self):
-        # This is the model we will use, total size 340,000 bytes.
-        model = torch.nn.Sequential(
-            torch.nn.Linear(100, 200, bias=False),  # size 80,000
-            torch.nn.Linear(200, 200, bias=False),  # size 160,000
-            torch.nn.Linear(200, 100, bias=False),  # size 80,000
-            torch.nn.Linear(100, 50, bias=False),  # size 20,000
-        )
-        state_dict = model.state_dict()
-
-        with self.subTest("No shard when max size is bigger than model size"):
-            shards, index = shard_checkpoint(state_dict)
-            self.assertIsNone(index)
-            self.assertDictEqual(shards, {WEIGHTS_NAME: state_dict})
-
-        with self.subTest("Test sharding, no weights bigger than max size"):
-            shards, index = shard_checkpoint(state_dict, max_shard_size="300kB")
-            # Split is first two layers then last two.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "0.weight": "pytorch_model-00001-of-00002.bin",
-                        "1.weight": "pytorch_model-00001-of-00002.bin",
-                        "2.weight": "pytorch_model-00002-of-00002.bin",
-                        "3.weight": "pytorch_model-00002-of-00002.bin",
-                    },
-                },
-            )
-
-            shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]}
-            shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
-            self.assertDictEqual(
-                shards, {"pytorch_model-00001-of-00002.bin": shard1, "pytorch_model-00002-of-00002.bin": shard2}
-            )
-
-        with self.subTest("Test sharding with weights bigger than max size"):
-            shards, index = shard_checkpoint(state_dict, max_shard_size="100kB")
-            # Split is first layer, second layer then last 2.
-            self.assertDictEqual(
-                index,
-                {
-                    "metadata": {"total_size": 340000},
-                    "weight_map": {
-                        "0.weight": "pytorch_model-00001-of-00003.bin",
-                        "1.weight": "pytorch_model-00002-of-00003.bin",
-                        "2.weight": "pytorch_model-00003-of-00003.bin",
-                        "3.weight": "pytorch_model-00003-of-00003.bin",
-                    },
-                },
-            )
-
-            shard1 = {"0.weight": state_dict["0.weight"]}
-            shard2 = {"1.weight": state_dict["1.weight"]}
-            shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]}
-            self.assertDictEqual(
-                shards,
-                {
-                    "pytorch_model-00001-of-00003.bin": shard1,
-                    "pytorch_model-00002-of-00003.bin": shard2,
-                    "pytorch_model-00003-of-00003.bin": shard3,
-                },
-            )
-
     def test_checkpoint_sharding_local_bin(self):
         model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
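
For reference, the removed 340,000-byte sharding test translates roughly to the hub API. A sketch, assuming `split_torch_state_dict_into_shards` accepts an integer byte budget for `max_shard_size` and a `filename_pattern` with a `{suffix}` placeholder:

    import torch
    from huggingface_hub import split_torch_state_dict_into_shards

    state_dict = torch.nn.Sequential(
        torch.nn.Linear(100, 200, bias=False),  # 80,000 bytes of fp32 weights
        torch.nn.Linear(200, 200, bias=False),  # 160,000 bytes
        torch.nn.Linear(200, 100, bias=False),  # 80,000 bytes
        torch.nn.Linear(100, 50, bias=False),   # 20,000 bytes
    ).state_dict()

    split = split_torch_state_dict_into_shards(
        state_dict,
        max_shard_size=300_000,  # same 300kB budget as the removed subtest
        filename_pattern="pytorch_model{suffix}.bin",
    )
    # Greedy split by key order: first two layers (240,000 bytes), then the rest.
    assert split.is_sharded and len(split.filename_to_tensors) == 2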